Switch from return_tuple to return_dict (#6138)
* Switch from return_tuple to return_dict
* Fix test
* [WIP] Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleC… (#5614)
* Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleChoice} models and tests
* AutoModels
Tiny tweaks
* Style
* Final changes before merge
* Re-order for simpler review
* Final fixes
* Addressing @sgugger's comments
* Test MultipleChoice
* Rework TF trainer (#6038)
* Fully rework training/prediction loops
* fix method name
* Fix variable name
* Fix property name
* Fix scope
* Fix method name
* Fix tuple index
* Fix tuple index
* Fix indentation
* Fix variable name
* fix eval before log
* Add drop remainder for test dataset
* Fix step number + fix logging datetime
* fix eval loss value
* use global step instead of step + fix logging at step 0
* Fix logging datetime
* Fix global_step usage
* Fix breaking loop + logging datetime
* Fix step in prediction loop
* Fix step breaking
* Fix train/test loops
* Force TF at least 2.2 for the trainer
* Use assert_cardinality to facilitate the dataset size computation
* Log steps per epoch
* Make tfds compliant with TPU
* Make tfds compliant with TPU
* Use TF dataset enumerate instead of the Python one
* revert previous commit
* Fix data_dir
* Apply style
* rebase on master
* Address Sylvain's comments
* Address Sylvain's and Lysandre's comments
* Trigger CI
* Remove unused import
* Switch from return_tuple to return_dict
* Fix test
* Add recent model
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Julien Plu <plu.julien@gmail.com>
This commit is contained in:
@@ -230,19 +230,16 @@ final activations of the model.
|
||||
|
||||
>>> ## PYTORCH CODE
|
||||
>>> print(pt_outputs)
|
||||
SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833, 4.3364],
|
||||
[ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
|
||||
(tensor([[-4.0833, 4.3364],
|
||||
[ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)
|
||||
>>> ## TENSORFLOW CODE
|
||||
>>> print(tf_outputs)
|
||||
(<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
|
||||
array([[-4.0832963 , 4.336414 ],
|
||||
[ 0.08181786, -0.04179301]], dtype=float32)>,)
|
||||
|
||||
The model can return more than just the final activations, which is why the PyTorch output is a special class and the
|
||||
TensorFlow output is a tuple. Here we only asked for the final activations, so we get a tuple with one element on the
|
||||
TensorFlow side and a :class:`~transformers.modeling_outputs.SequenceClassifierOutput` with just the ``logits`` field
|
||||
filled on the PyTorch side.
|
||||
|
||||
The model can return more than just the final activations, which is why the output is a tuple. Here we only asked for
|
||||
the final activations, so we get a tuple with one element.
|
||||
.. note::
|
||||
|
||||
All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final
|
||||
@@ -254,7 +251,7 @@ Let's apply the SoftMax activation to get predictions.
|
||||
|
||||
>>> ## PYTORCH CODE
|
||||
>>> import torch.nn.functional as F
|
||||
>>> pt_predictions = F.softmax(pt_outputs.logits, dim=-1)
|
||||
>>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
|
||||
>>> ## TENSORFLOW CODE
|
||||
>>> import tensorflow as tf
|
||||
>>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
|
||||
@@ -342,7 +339,7 @@ code is easy to access and tweak if you need to.
|
||||
In our previous example, the model was called "distilbert-base-uncased-finetuned-sst-2-english", which means it's
|
||||
using the :doc:`DistilBERT </model_doc/distilbert>` architecture. As
|
||||
:class:`~transformers.AutoModelForSequenceClassification` (or :class:`~transformers.TFAutoModelForSequenceClassification`
|
||||
if you are using TensorFlow)` was used, the model automatically created is then a
|
||||
if you are using TensorFlow) was used, the model automatically created is then a
|
||||
:class:`~transformers.DistilBertForSequenceClassification`. You can look at its documentation for all details relevant
|
||||
to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
|
||||
without the auto magic:
|
||||
|
||||
@@ -49,7 +49,7 @@ put it in train mode.
|
||||
.. code-block:: python
|
||||
|
||||
from transformers import BertForSequenceClassification
|
||||
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
||||
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
|
||||
model.train()
|
||||
|
||||
This is useful because it allows us to make use of the pre-trained BERT
|
||||
|
||||
@@ -199,9 +199,6 @@ def train(args, train_dataset, model, tokenizer):
|
||||
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
|
||||
)
|
||||
|
||||
if isinstance(model, torch.nn.DataParallel):
|
||||
inputs["return_tuple"] = True
|
||||
|
||||
outputs = model(**inputs)
|
||||
# model outputs are always tuple in transformers (see doc)
|
||||
loss = outputs[0]
|
||||
@@ -316,8 +313,6 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
inputs.update(
|
||||
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
|
||||
)
|
||||
if isinstance(model, torch.nn.DataParallel):
|
||||
inputs["return_tuple"] = True
|
||||
outputs = model(**inputs)
|
||||
|
||||
for i, feature_index in enumerate(feature_indices):
|
||||
|
||||
@@ -144,7 +144,7 @@ class TestSummarizationDistiller(unittest.TestCase):
|
||||
evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp()))
|
||||
|
||||
def test_loss_fn(self):
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY)
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY, return_dict=True)
|
||||
input_ids, mask = model.dummy_inputs["input_ids"], model.dummy_inputs["attention_mask"]
|
||||
target_ids = torch.tensor([[0, 4, 8, 2], [0, 8, 2, 1]], dtype=torch.long, device=model.device)
|
||||
decoder_input_ids = target_ids[:, :-1].contiguous() # Why this line?
|
||||
|
||||
@@ -49,8 +49,9 @@ class PretrainedConfig(object):
|
||||
Whether or not the model should return all attentions.
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not the model should return tuples instead of :obj:`ModelOutput` objects.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether the model is used as an encoder/decoder or not.
|
||||
is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
@@ -133,7 +134,7 @@ class PretrainedConfig(object):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
# Attributes with defaults
|
||||
self.return_tuple = kwargs.pop("return_tuple", False)
|
||||
self.return_dict = kwargs.pop("return_dict", False)
|
||||
self.output_hidden_states = kwargs.pop("output_hidden_states", False)
|
||||
self.output_attentions = kwargs.pop("output_attentions", False)
|
||||
self.use_cache = kwargs.pop("use_cache", True) # Not used by all models
|
||||
@@ -194,12 +195,12 @@ class PretrainedConfig(object):
|
||||
raise err
|
||||
|
||||
@property
|
||||
def use_return_tuple(self) -> bool:
|
||||
def use_return_dict(self) -> bool:
|
||||
"""
|
||||
:obj:`bool`: Whether or not the model should return a tuple.
|
||||
:obj:`bool`: Whether or not to return :class:`~transformers.file_utils.ModelOutput` instead of tuples.
|
||||
"""
|
||||
# If torchscript is set, force return_tuple to avoid jit errors
|
||||
return self.return_tuple or self.torchscript
|
||||
# If torchscript is set, force `return_dict=False` to avoid jit errors
|
||||
return self.return_dict and not self.torchscript
|
||||
|
||||
@property
|
||||
def num_labels(self) -> int:
|
||||
|
||||
@@ -13,14 +13,17 @@ import shutil
|
||||
import sys
|
||||
import tarfile
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import fields
|
||||
from functools import partial, wraps
|
||||
from hashlib import sha256
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Union
|
||||
from typing import Any, Dict, Optional, Tuple, Union
|
||||
from urllib.parse import urlparse
|
||||
from zipfile import ZipFile, is_zipfile
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from filelock import FileLock
|
||||
from tqdm.auto import tqdm
|
||||
@@ -190,8 +193,8 @@ def add_end_docstrings(*docstr):
|
||||
RETURN_INTRODUCTION = r"""
|
||||
Returns:
|
||||
:class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`:
|
||||
A :class:`~{full_output_type}` or a tuple of :obj:`torch.FloatTensor` (if ``return_tuple=True`` is passed or
|
||||
when ``config.return_tuple=True``) comprising various elements depending on the configuration
|
||||
A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a
|
||||
tuple of :obj:`torch.FloatTensor` comprising various elements depending on the configuration
|
||||
(:class:`~transformers.{config_class}`) and inputs.
|
||||
|
||||
"""
|
||||
@@ -257,7 +260,7 @@ PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
|
||||
@@ -274,7 +277,7 @@ PT_QUESTION_ANSWERING_SAMPLE = r"""
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> start_positions = torch.tensor([1])
|
||||
@@ -293,7 +296,7 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
@@ -309,7 +312,7 @@ PT_MASKED_LM_SAMPLE = r"""
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
|
||||
|
||||
>>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
|
||||
|
||||
@@ -325,7 +328,7 @@ PT_BASE_MODEL_SAMPLE = r"""
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
@@ -340,7 +343,7 @@ PT_MULTIPLE_CHOICE_SAMPLE = r"""
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
|
||||
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> choice0 = "It is eaten with a fork and a knife."
|
||||
@@ -362,7 +365,7 @@ PT_CAUSAL_LM_SAMPLE = r"""
|
||||
>>> from transformers import {tokenizer_class}, {model_class}
|
||||
|
||||
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}')
|
||||
>>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs, labels=inputs["input_ids"])
|
||||
@@ -900,30 +903,91 @@ def tf_required(func):
|
||||
return wrapper
|
||||
|
||||
|
||||
class ModelOutput:
|
||||
def is_tensor(x):
|
||||
""" Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`. """
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if isinstance(x, torch.Tensor):
|
||||
return True
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
|
||||
if isinstance(x, tf.Tensor):
|
||||
return True
|
||||
return isinstance(x, np.ndarray)
|
||||
|
||||
|
||||
class ModelOutput(OrderedDict):
|
||||
"""
|
||||
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
|
||||
a tuple) or strings (like a dictionnary) that will ignore the ``None`` attributes.
|
||||
a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a
|
||||
regular python dictionary.
|
||||
|
||||
.. warning::
|
||||
You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple`
|
||||
method to convert it to a tuple first.
|
||||
"""
|
||||
|
||||
def to_tuple(self):
|
||||
def __post_init__(self):
|
||||
class_fields = fields(self)
|
||||
|
||||
# Safety and consistency checks
|
||||
assert len(class_fields), f"{self.__class__.__name__} has no fields."
|
||||
assert all(
|
||||
field.default is None for field in class_fields[1:]
|
||||
), f"{self.__class__.__name__} should not have more than one required field."
|
||||
|
||||
first_field = getattr(self, class_fields[0].name)
|
||||
other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
|
||||
|
||||
if other_fields_are_none and not is_tensor(first_field):
|
||||
try:
|
||||
iterator = iter(first_field)
|
||||
first_field_iterator = True
|
||||
except TypeError:
|
||||
first_field_iterator = False
|
||||
|
||||
# if we provided an iterator as first field and the iterator is a (key, value) iterator
|
||||
# set the associated fields
|
||||
if first_field_iterator:
|
||||
for element in iterator:
|
||||
if (
|
||||
not isinstance(element, (list, tuple))
|
||||
or not len(element) == 2
|
||||
or not isinstance(element[0], str)
|
||||
):
|
||||
break
|
||||
setattr(self, element[0], element[1])
|
||||
if element[1] is not None:
|
||||
self[element[0]] = element[1]
|
||||
else:
|
||||
for field in class_fields:
|
||||
v = getattr(self, field.name)
|
||||
if v is not None:
|
||||
self[field.name] = v
|
||||
|
||||
def __delitem__(self, *args, **kwargs):
|
||||
raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
|
||||
|
||||
def setdefault(self, *args, **kwargs):
|
||||
raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
|
||||
|
||||
def pop(self, *args, **kwargs):
|
||||
raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
|
||||
|
||||
def update(self, *args, **kwargs):
|
||||
raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
|
||||
|
||||
def __getitem__(self, k):
|
||||
if isinstance(k, str):
|
||||
inner_dict = {k: v for (k, v) in self.items()}
|
||||
return inner_dict[k]
|
||||
else:
|
||||
return self.to_tuple()[k]
|
||||
|
||||
def to_tuple(self) -> Tuple[Any]:
|
||||
"""
|
||||
Converts :obj:`self` to a tuple.
|
||||
|
||||
Return: A tuple containing all non-:obj:`None` attributes of the :obj:`self`.
|
||||
Convert self to a tuple containing all the attributes/keys that are not ``None``.
|
||||
"""
|
||||
return tuple(getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None)
|
||||
|
||||
def to_dict(self):
|
||||
"""
|
||||
Converts :obj:`self` to a Python dictionary.
|
||||
|
||||
Return: A dictionary containing all non-:obj:`None` attributes of the :obj:`self`.
|
||||
"""
|
||||
return {f: getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None}
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self.to_dict()[i] if isinstance(i, str) else self.to_tuple()[i]
|
||||
|
||||
def __len__(self):
|
||||
return len(self.to_tuple())
|
||||
return tuple(self[k] for k in self.keys())
|
||||
|
||||
@@ -346,7 +346,7 @@ class AlbertTransformer(nn.Module):
|
||||
head_mask=None,
|
||||
output_attentions=False,
|
||||
output_hidden_states=False,
|
||||
return_tuple=False,
|
||||
return_dict=False,
|
||||
):
|
||||
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
|
||||
|
||||
@@ -375,7 +375,7 @@ class AlbertTransformer(nn.Module):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
|
||||
@@ -430,9 +430,9 @@ class AlbertForPretrainingOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
prediction_logits: torch.FloatTensor
|
||||
sop_logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
prediction_logits: torch.FloatTensor = None
|
||||
sop_logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -488,8 +488,9 @@ ALBERT_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -561,13 +562,13 @@ class AlbertModel(AlbertPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -599,14 +600,14 @@ class AlbertModel(AlbertPreTrainedModel):
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = encoder_outputs[0]
|
||||
|
||||
pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
@@ -653,7 +654,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
sentence_order_label=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -678,7 +679,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
>>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
|
||||
>>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True)
|
||||
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
@@ -695,7 +696,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids,
|
||||
@@ -706,7 +707,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output, pooled_output = outputs[:2]
|
||||
@@ -721,7 +722,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
|
||||
total_loss = masked_lm_loss + sentence_order_loss
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores, sop_scores) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -808,7 +809,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -827,7 +828,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids=input_ids,
|
||||
@@ -838,7 +839,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_outputs = outputs[0]
|
||||
|
||||
@@ -849,7 +850,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + outputs[2:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
@@ -895,7 +896,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -904,7 +905,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
|
||||
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
|
||||
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids=input_ids,
|
||||
@@ -915,7 +916,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -933,7 +934,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -976,14 +977,14 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids,
|
||||
@@ -994,7 +995,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1014,7 +1015,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1057,7 +1058,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1069,7 +1070,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Position outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids=input_ids,
|
||||
@@ -1080,7 +1081,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1107,7 +1108,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -1153,7 +1154,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1161,7 +1162,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
@@ -1182,7 +1183,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -1196,7 +1197,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
|
||||
@@ -124,8 +124,9 @@ BART_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -304,7 +305,7 @@ class BartEncoder(nn.Module):
|
||||
self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None
|
||||
|
||||
def forward(
|
||||
self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_tuple=False
|
||||
self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=False
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
@@ -359,7 +360,7 @@ class BartEncoder(nn.Module):
|
||||
# T x B x C -> B x T x C
|
||||
x = x.transpose(0, 1)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [x, encoder_states, all_attentions] if v is not None)
|
||||
return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions)
|
||||
|
||||
@@ -495,7 +496,7 @@ class BartDecoder(nn.Module):
|
||||
use_cache=False,
|
||||
output_attentions=False,
|
||||
output_hidden_states=False,
|
||||
return_tuple=False,
|
||||
return_dict=False,
|
||||
**unused,
|
||||
):
|
||||
"""
|
||||
@@ -588,7 +589,7 @@ class BartDecoder(nn.Module):
|
||||
else:
|
||||
next_cache = None
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [x, next_cache, all_hidden_states, all_self_attns] if v is not None)
|
||||
return BaseModelOutputWithPast(
|
||||
last_hidden_state=x, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns
|
||||
@@ -850,7 +851,7 @@ class BartModel(PretrainedBartModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
|
||||
@@ -862,7 +863,7 @@ class BartModel(PretrainedBartModel):
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# make masks if user doesn't supply
|
||||
if not use_cache:
|
||||
@@ -884,10 +885,10 @@ class BartModel(PretrainedBartModel):
|
||||
attention_mask=attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_tuple=False
|
||||
elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
|
||||
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
encoder_outputs = BaseModelOutput(
|
||||
last_hidden_state=encoder_outputs[0],
|
||||
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
|
||||
@@ -905,10 +906,10 @@ class BartModel(PretrainedBartModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return decoder_outputs + encoder_outputs
|
||||
|
||||
return Seq2SeqModelOutput(
|
||||
@@ -976,7 +977,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**unused,
|
||||
):
|
||||
r"""
|
||||
@@ -1018,7 +1019,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
||||
FutureWarning,
|
||||
)
|
||||
decoder_past_key_values = unused.pop("decoder_cached_states")
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if labels is not None:
|
||||
use_cache = False
|
||||
@@ -1033,7 +1034,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
|
||||
|
||||
@@ -1043,7 +1044,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
||||
# TODO(SS): do we need to ignore pad tokens in labels?
|
||||
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (lm_logits,) + outputs[1:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
@@ -1146,7 +1147,7 @@ class BartForSequenceClassification(PretrainedBartModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1154,7 +1155,7 @@ class BartForSequenceClassification(PretrainedBartModel):
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
if labels is not None:
|
||||
use_cache = False
|
||||
|
||||
@@ -1167,7 +1168,7 @@ class BartForSequenceClassification(PretrainedBartModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
x = outputs[0] # last hidden state
|
||||
eos_mask = input_ids.eq(self.config.eos_token_id)
|
||||
@@ -1180,7 +1181,7 @@ class BartForSequenceClassification(PretrainedBartModel):
|
||||
if labels is not None:
|
||||
loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1232,7 +1233,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1244,7 +1245,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
if start_positions is not None and end_positions is not None:
|
||||
use_cache = False
|
||||
|
||||
@@ -1257,7 +1258,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1284,7 +1285,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits,) + outputs[1:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
|
||||
@@ -429,7 +429,7 @@ class BertEncoder(nn.Module):
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=False,
|
||||
output_hidden_states=False,
|
||||
return_tuple=False,
|
||||
return_dict=False,
|
||||
):
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_attentions = () if output_attentions else None
|
||||
@@ -469,7 +469,7 @@ class BertEncoder(nn.Module):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
|
||||
@@ -609,9 +609,9 @@ class BertForPretrainingOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
prediction_logits: torch.FloatTensor
|
||||
seq_relationship_logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
prediction_logits: torch.FloatTensor = None
|
||||
seq_relationship_logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -674,8 +674,9 @@ BERT_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -743,13 +744,13 @@ class BertModel(BertPreTrainedModel):
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -800,12 +801,12 @@ class BertModel(BertPreTrainedModel):
|
||||
encoder_attention_mask=encoder_extended_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.pooler(sequence_output)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
@@ -847,7 +848,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
next_sentence_label=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -872,7 +873,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
@@ -887,7 +888,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -898,7 +899,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output, pooled_output = outputs[:2]
|
||||
@@ -911,7 +912,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
||||
total_loss = masked_lm_loss + next_sentence_loss
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores, seq_relationship_score) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -955,7 +956,7 @@ class BertLMHeadModel(BertPreTrainedModel):
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -977,14 +978,14 @@ class BertLMHeadModel(BertPreTrainedModel):
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
||||
>>> config = BertConfig.from_pretrained("bert-base-cased")
|
||||
>>> config.is_decoder = True
|
||||
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
|
||||
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
|
||||
>>> prediction_logits = outputs.logits
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -997,7 +998,7 @@ class BertLMHeadModel(BertPreTrainedModel):
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1011,7 +1012,7 @@ class BertLMHeadModel(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + outputs[2:]
|
||||
return ((lm_loss,) + output) if lm_loss is not None else output
|
||||
|
||||
@@ -1065,7 +1066,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -1086,7 +1087,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
assert "lm_labels" not in kwargs, "Use `BertWithLMHead` for autoregressive language modeling task."
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -1099,7 +1100,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1110,7 +1111,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + outputs[2:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
@@ -1161,7 +1162,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
next_sentence_label=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1178,7 +1179,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
|
||||
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
|
||||
@@ -1188,7 +1189,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
>>> logits = outputs.logits
|
||||
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -1199,7 +1200,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -1211,7 +1212,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), next_sentence_label.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (seq_relationship_scores,) + outputs[2:]
|
||||
return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
|
||||
|
||||
@@ -1257,7 +1258,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1266,7 +1267,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -1277,7 +1278,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -1295,7 +1296,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1337,7 +1338,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1345,7 +1346,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
@@ -1367,7 +1368,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -1381,7 +1382,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1424,14 +1425,14 @@ class BertForTokenClassification(BertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -1442,7 +1443,7 @@ class BertForTokenClassification(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1464,7 +1465,7 @@ class BertForTokenClassification(BertPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1507,7 +1508,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1519,7 +1520,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -1530,7 +1531,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1557,7 +1558,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
|
||||
@@ -51,12 +51,6 @@ CAMEMBERT_START_DOCSTRING = r"""
|
||||
model. Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration.
|
||||
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
"""
|
||||
|
||||
|
||||
|
||||
@@ -295,8 +295,9 @@ CTRL_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -355,7 +356,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
if "past" in kwargs:
|
||||
@@ -371,7 +372,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -472,7 +473,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
|
||||
all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
|
||||
|
||||
return BaseModelOutputWithPast(
|
||||
@@ -526,7 +527,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -544,7 +545,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
)
|
||||
past_key_values = kwargs.pop("past")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -557,7 +558,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
hidden_states = transformer_outputs[0]
|
||||
@@ -573,7 +574,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (lm_logits,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
|
||||
@@ -279,7 +279,7 @@ class Transformer(nn.Module):
|
||||
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])
|
||||
|
||||
def forward(
|
||||
self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_tuple=None
|
||||
self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
@@ -324,7 +324,7 @@ class Transformer(nn.Module):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_state,)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
|
||||
@@ -396,8 +396,9 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -444,13 +445,13 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -477,7 +478,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
|
||||
@@ -516,7 +517,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -535,7 +536,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
dlbrt_output = self.distilbert(
|
||||
input_ids=input_ids,
|
||||
@@ -544,7 +545,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
|
||||
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
|
||||
@@ -556,7 +557,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
if labels is not None:
|
||||
mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_logits,) + dlbrt_output[1:]
|
||||
return ((mlm_loss,) + output) if mlm_loss is not None else output
|
||||
|
||||
@@ -601,7 +602,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -610,7 +611,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
distilbert_output = self.distilbert(
|
||||
input_ids=input_ids,
|
||||
@@ -619,7 +620,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
|
||||
pooled_output = hidden_state[:, 0] # (bs, dim)
|
||||
@@ -637,7 +638,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
|
||||
loss_fct = nn.CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + distilbert_output[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -682,7 +683,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -694,7 +695,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
distilbert_output = self.distilbert(
|
||||
input_ids=input_ids,
|
||||
@@ -703,7 +704,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
|
||||
|
||||
@@ -730,7 +731,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + distilbert_output[1:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -775,14 +776,14 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.distilbert(
|
||||
input_ids,
|
||||
@@ -791,7 +792,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -813,7 +814,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -849,7 +850,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -865,7 +866,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
|
||||
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
|
||||
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True)
|
||||
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> choice0 = "It is eaten with a fork and a knife."
|
||||
@@ -879,7 +880,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
@@ -897,7 +898,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
hidden_state = outputs[0] # (bs * num_choices, seq_len, dim)
|
||||
@@ -914,7 +915,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
|
||||
@@ -134,8 +134,8 @@ class DPRReaderOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
start_logits: torch.FloatTensor
|
||||
end_logits: torch.FloatTensor
|
||||
relevance_logits: torch.FloatTensor
|
||||
end_logits: torch.FloatTensor = None
|
||||
relevance_logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -161,7 +161,7 @@ class DPREncoder(PreTrainedModel):
|
||||
inputs_embeds: Optional[Tensor] = None,
|
||||
output_attentions: bool = False,
|
||||
output_hidden_states: bool = False,
|
||||
return_tuple: bool = False,
|
||||
return_dict: bool = False,
|
||||
) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]:
|
||||
outputs = self.bert_model(
|
||||
input_ids=input_ids,
|
||||
@@ -170,14 +170,14 @@ class DPREncoder(PreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output, pooled_output = outputs[:2]
|
||||
pooled_output = sequence_output[:, 0, :]
|
||||
if self.projection_dim > 0:
|
||||
pooled_output = self.encode_proj(pooled_output)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + outputs[2:]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
@@ -217,7 +217,7 @@ class DPRSpanPredictor(PreTrainedModel):
|
||||
inputs_embeds: Optional[Tensor] = None,
|
||||
output_attentions: bool = False,
|
||||
output_hidden_states: bool = False,
|
||||
return_tuple: bool = False,
|
||||
return_dict: bool = False,
|
||||
) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
|
||||
# notations: N - number of questions in a batch, M - number of passages per questions, L - sequence length
|
||||
n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
|
||||
@@ -228,7 +228,7 @@ class DPRSpanPredictor(PreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = outputs[0]
|
||||
|
||||
@@ -244,7 +244,7 @@ class DPRSpanPredictor(PreTrainedModel):
|
||||
end_logits = end_logits.view(n_passages, sequence_length)
|
||||
relevance_logits = relevance_logits.view(n_passages)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (start_logits, end_logits, relevance_logits) + outputs[2:]
|
||||
|
||||
return DPRReaderOutput(
|
||||
@@ -361,6 +361,9 @@ DPR_ENCODERS_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states tensors of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
DPR_READER_INPUTS_DOCSTRING = r"""
|
||||
@@ -388,6 +391,9 @@ DPR_READER_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states tensors of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -412,7 +418,7 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
|
||||
inputs_embeds: Optional[Tensor] = None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
|
||||
r"""
|
||||
Return:
|
||||
@@ -421,7 +427,7 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
|
||||
|
||||
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
|
||||
tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
|
||||
model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
|
||||
model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
|
||||
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
|
||||
embeddings = model(input_ids).pooler_output
|
||||
"""
|
||||
@@ -430,7 +436,7 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -459,10 +465,10 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return outputs[1:]
|
||||
return DPRContextEncoderOutput(
|
||||
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
|
||||
@@ -490,7 +496,7 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
|
||||
inputs_embeds: Optional[Tensor] = None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
|
||||
r"""
|
||||
Return:
|
||||
@@ -499,7 +505,7 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
|
||||
|
||||
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
|
||||
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
|
||||
model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
|
||||
model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True)
|
||||
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
|
||||
embeddings = model(input_ids).pooler_output
|
||||
"""
|
||||
@@ -507,7 +513,7 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -536,10 +542,10 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return outputs[1:]
|
||||
return DPRQuestionEncoderOutput(
|
||||
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
|
||||
@@ -565,7 +571,7 @@ class DPRReader(DPRPretrainedReader):
|
||||
inputs_embeds: Optional[Tensor] = None,
|
||||
output_attentions: bool = None,
|
||||
output_hidden_states: bool = None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
|
||||
r"""
|
||||
Return:
|
||||
@@ -574,7 +580,7 @@ class DPRReader(DPRPretrainedReader):
|
||||
|
||||
from transformers import DPRReader, DPRReaderTokenizer
|
||||
tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
|
||||
model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
|
||||
model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True)
|
||||
encoded_inputs = tokenizer(
|
||||
questions=["What is love ?"],
|
||||
titles=["Haddaway"],
|
||||
@@ -591,7 +597,7 @@ class DPRReader(DPRPretrainedReader):
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -613,5 +619,5 @@ class DPRReader(DPRPretrainedReader):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
@@ -208,8 +208,8 @@ class ElectraForPretrainingOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -272,8 +272,9 @@ ELECTRA_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -331,13 +332,13 @@ class ElectraModel(ElectraPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -371,7 +372,7 @@ class ElectraModel(ElectraPreTrainedModel):
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
return hidden_states
|
||||
@@ -428,7 +429,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -437,7 +438,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
discriminator_hidden_states = self.electra(
|
||||
input_ids,
|
||||
@@ -448,7 +449,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
|
||||
inputs_embeds,
|
||||
output_attentions,
|
||||
output_hidden_states,
|
||||
return_tuple,
|
||||
return_dict,
|
||||
)
|
||||
|
||||
sequence_output = discriminator_hidden_states[0]
|
||||
@@ -464,7 +465,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + discriminator_hidden_states[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -505,7 +506,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
@@ -527,7 +528,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> logits = model(input_ids).logits
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
discriminator_hidden_states = self.electra(
|
||||
input_ids,
|
||||
@@ -538,7 +539,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
|
||||
inputs_embeds,
|
||||
output_attentions,
|
||||
output_hidden_states,
|
||||
return_tuple,
|
||||
return_dict,
|
||||
)
|
||||
discriminator_sequence_output = discriminator_hidden_states[0]
|
||||
|
||||
@@ -555,7 +556,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float())
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + discriminator_hidden_states[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -606,7 +607,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -625,7 +626,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
generator_hidden_states = self.electra(
|
||||
input_ids,
|
||||
@@ -636,7 +637,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
|
||||
inputs_embeds,
|
||||
output_attentions,
|
||||
output_hidden_states,
|
||||
return_tuple,
|
||||
return_dict,
|
||||
)
|
||||
generator_sequence_output = generator_hidden_states[0]
|
||||
|
||||
@@ -649,7 +650,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
|
||||
loss_fct = nn.CrossEntropyLoss() # -100 index = padding token
|
||||
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + generator_hidden_states[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -695,14 +696,14 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
discriminator_hidden_states = self.electra(
|
||||
input_ids,
|
||||
@@ -713,7 +714,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
|
||||
inputs_embeds,
|
||||
output_attentions,
|
||||
output_hidden_states,
|
||||
return_tuple,
|
||||
return_dict,
|
||||
)
|
||||
discriminator_sequence_output = discriminator_hidden_states[0]
|
||||
|
||||
@@ -732,7 +733,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + discriminator_hidden_states[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -782,7 +783,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -794,7 +795,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
discriminator_hidden_states = self.electra(
|
||||
input_ids,
|
||||
@@ -831,7 +832,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits,) + discriminator_hidden_states[1:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -876,7 +877,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -884,7 +885,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
@@ -905,7 +906,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = discriminator_hidden_states[0]
|
||||
@@ -919,7 +920,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + discriminator_hidden_states[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
|
||||
@@ -273,7 +273,6 @@ class EncoderDecoderModel(PreTrainedModel):
|
||||
attention_mask=attention_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
head_mask=head_mask,
|
||||
return_tuple=True,
|
||||
**kwargs_encoder,
|
||||
)
|
||||
|
||||
@@ -288,7 +287,6 @@ class EncoderDecoderModel(PreTrainedModel):
|
||||
encoder_attention_mask=attention_mask,
|
||||
head_mask=decoder_head_mask,
|
||||
labels=labels,
|
||||
return_tuple=True,
|
||||
**kwargs_decoder,
|
||||
)
|
||||
|
||||
|
||||
@@ -110,8 +110,9 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -148,13 +149,13 @@ class FlaubertModel(XLMModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# removed: src_enc=None, src_len=None
|
||||
if input_ids is not None:
|
||||
@@ -284,7 +285,7 @@ class FlaubertModel(XLMModel):
|
||||
# move back sequence length to dimension 0
|
||||
# tensor = tensor.transpose(0, 1)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
|
||||
|
||||
return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
|
||||
|
||||
@@ -323,10 +323,10 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
lm_loss: Optional[torch.FloatTensor]
|
||||
mc_loss: Optional[torch.FloatTensor]
|
||||
lm_logits: torch.FloatTensor
|
||||
mc_logits: torch.FloatTensor
|
||||
lm_loss: Optional[torch.FloatTensor] = None
|
||||
mc_loss: Optional[torch.FloatTensor] = None
|
||||
lm_logits: torch.FloatTensor = None
|
||||
mc_logits: torch.FloatTensor = None
|
||||
past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -395,8 +395,9 @@ GPT2_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -448,7 +449,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
if "past" in kwargs:
|
||||
@@ -464,7 +465,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -560,7 +561,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
|
||||
|
||||
return BaseModelOutputWithPast(
|
||||
@@ -616,7 +617,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -634,7 +635,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
)
|
||||
past_key_values = kwargs.pop("past")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -647,7 +648,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
hidden_states = transformer_outputs[0]
|
||||
|
||||
@@ -662,7 +663,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (lm_logits,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -713,7 +714,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -741,7 +742,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
>>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
||||
|
||||
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
||||
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2', return_dict=True)
|
||||
|
||||
>>> # Add a [CLS] to the vocabulary (we should train it also!)
|
||||
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
|
||||
@@ -773,7 +774,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
)
|
||||
past_key_values = kwargs.pop("past")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -786,7 +787,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
hidden_states = transformer_outputs[0]
|
||||
@@ -805,7 +806,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (lm_logits, mc_logits) + transformer_outputs[1:]
|
||||
if mc_loss is not None:
|
||||
output = (mc_loss,) + output
|
||||
|
||||
@@ -694,7 +694,7 @@ class LongformerEncoder(nn.Module):
|
||||
attention_mask=None,
|
||||
output_attentions=False,
|
||||
output_hidden_states=False,
|
||||
return_tuple=False,
|
||||
return_dict=False,
|
||||
):
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_attentions = () if output_attentions else None
|
||||
@@ -724,7 +724,7 @@ class LongformerEncoder(nn.Module):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
|
||||
@@ -811,8 +811,9 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -942,7 +943,7 @@ class LongformerModel(LongformerPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
|
||||
@@ -953,7 +954,7 @@ class LongformerModel(LongformerPreTrainedModel):
|
||||
>>> import torch
|
||||
>>> from transformers import LongformerModel, LongformerTokenizer
|
||||
|
||||
>>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
|
||||
>>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True)
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
|
||||
|
||||
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
|
||||
@@ -965,14 +966,16 @@ class LongformerModel(LongformerPreTrainedModel):
|
||||
... # classification: the <s> token
|
||||
... # QA: question tokens
|
||||
... # LM: potentially on the beginning of sentences and paragraphs
|
||||
>>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
|
||||
>>> outputs = model(input_ids, attention_mask=attention_mask)
|
||||
>>> sequence_output = outputs.last_hidden_state
|
||||
>>> pooled_output = outputs.pooler_output
|
||||
"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -1016,7 +1019,7 @@ class LongformerModel(LongformerPreTrainedModel):
|
||||
attention_mask=extended_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.pooler(sequence_output)
|
||||
@@ -1026,7 +1029,7 @@ class LongformerModel(LongformerPreTrainedModel):
|
||||
# unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1)
|
||||
sequence_output = sequence_output[:, :-padding_len]
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
@@ -1063,7 +1066,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -1082,7 +1085,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
|
||||
>>> import torch
|
||||
>>> from transformers import LongformerForMaskedLM, LongformerTokenizer
|
||||
|
||||
>>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
|
||||
>>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True)
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
|
||||
|
||||
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
|
||||
@@ -1102,7 +1105,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.longformer(
|
||||
input_ids,
|
||||
@@ -1113,7 +1116,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.lm_head(sequence_output)
|
||||
@@ -1123,7 +1126,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + outputs[2:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
@@ -1171,7 +1174,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1180,7 +1183,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if global_attention_mask is None:
|
||||
logger.info("Initializing global attention on CLS token...")
|
||||
@@ -1197,7 +1200,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = outputs[0]
|
||||
logits = self.classifier(sequence_output)
|
||||
@@ -1212,7 +1215,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1272,7 +1275,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1291,7 +1294,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
|
||||
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
|
||||
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True)
|
||||
|
||||
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
>>> encoding = tokenizer(question, text, return_tensors="pt")
|
||||
@@ -1310,7 +1313,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
|
||||
>>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
|
||||
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# set global attention on question tokens
|
||||
if global_attention_mask is None:
|
||||
@@ -1327,7 +1330,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1354,7 +1357,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -1404,14 +1407,14 @@ class LongformerForTokenClassification(BertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.longformer(
|
||||
input_ids,
|
||||
@@ -1422,7 +1425,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1444,7 +1447,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1489,7 +1492,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1498,7 +1501,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# set global attention on question tokens
|
||||
if global_attention_mask is None:
|
||||
@@ -1536,7 +1539,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
|
||||
inputs_embeds=flat_inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
pooled_output = outputs[1]
|
||||
|
||||
@@ -1549,7 +1552,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ import torch.nn as nn
|
||||
from torch.nn import CrossEntropyLoss, MSELoss
|
||||
|
||||
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable, replace_return_docstrings
|
||||
from .modeling_outputs import BaseModelOutputWithPooling
|
||||
from .modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
|
||||
from .modeling_utils import ModuleUtilsMixin
|
||||
|
||||
|
||||
@@ -148,8 +148,9 @@ MMBT_INPUTS_DOCSTRING = r""" Inputs:
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -182,7 +183,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
Returns:
|
||||
@@ -198,7 +199,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -257,13 +258,13 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
|
||||
encoder_attention_mask=encoder_extended_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.transformer.pooler(sequence_output)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
@@ -339,7 +340,9 @@ class MMBTForClassification(nn.Module):
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
return_dict=None,
|
||||
):
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.mmbt(
|
||||
input_modal=input_modal,
|
||||
@@ -353,6 +356,7 @@ class MMBTForClassification(nn.Module):
|
||||
modal_position_ids=modal_position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -360,8 +364,7 @@ class MMBTForClassification(nn.Module):
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
|
||||
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
if self.num_labels == 1:
|
||||
# We are doing regression
|
||||
@@ -370,6 +373,11 @@ class MMBTForClassification(nn.Module):
|
||||
else:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
outputs = (loss,) + outputs
|
||||
|
||||
return outputs # (loss), logits, (hidden_states), (attentions)
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
@@ -550,7 +550,7 @@ class MobileBertEncoder(nn.Module):
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=False,
|
||||
output_hidden_states=False,
|
||||
return_tuple=False,
|
||||
return_dict=False,
|
||||
):
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_attentions = () if output_attentions else None
|
||||
@@ -575,7 +575,7 @@ class MobileBertEncoder(nn.Module):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
|
||||
@@ -708,9 +708,9 @@ class MobileBertForPretrainingOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
prediction_logits: torch.FloatTensor
|
||||
seq_relationship_logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
prediction_logits: torch.FloatTensor = None
|
||||
seq_relationship_logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -773,8 +773,9 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -831,13 +832,13 @@ class MobileBertModel(MobileBertPreTrainedModel):
|
||||
encoder_attention_mask=None,
|
||||
output_hidden_states=None,
|
||||
output_attentions=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -890,12 +891,12 @@ class MobileBertModel(MobileBertPreTrainedModel):
|
||||
encoder_attention_mask=encoder_extended_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.pooler(sequence_output)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
@@ -958,7 +959,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
|
||||
next_sentence_label=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
@@ -979,7 +980,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
|
||||
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
|
||||
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True)
|
||||
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
@@ -988,7 +989,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
|
||||
>>> seq_relationship_logits = outputs.seq_relationship_logits
|
||||
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.mobilebert(
|
||||
input_ids,
|
||||
@@ -999,7 +1000,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output, pooled_output = outputs[:2]
|
||||
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
|
||||
@@ -1011,7 +1012,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
|
||||
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
||||
total_loss = masked_lm_loss + next_sentence_loss
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores, seq_relationship_score) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -1079,7 +1080,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
|
||||
encoder_attention_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -1097,7 +1098,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
|
||||
FutureWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.mobilebert(
|
||||
input_ids,
|
||||
@@ -1110,7 +1111,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1121,7 +1122,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + outputs[2:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
@@ -1169,7 +1170,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
|
||||
next_sentence_label=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1186,7 +1187,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
|
||||
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
|
||||
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True)
|
||||
|
||||
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
|
||||
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
|
||||
@@ -1196,7 +1197,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.mobilebert(
|
||||
input_ids,
|
||||
@@ -1207,7 +1208,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -1218,7 +1219,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (seq_relationship_score,) + outputs[2:]
|
||||
return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
|
||||
|
||||
@@ -1263,7 +1264,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1272,7 +1273,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.mobilebert(
|
||||
input_ids,
|
||||
@@ -1283,7 +1284,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
pooled_output = outputs[1]
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
@@ -1299,7 +1300,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1342,7 +1343,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1354,7 +1355,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.mobilebert(
|
||||
input_ids,
|
||||
@@ -1365,7 +1366,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1392,7 +1393,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -1438,7 +1439,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1446,7 +1447,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
@@ -1468,7 +1469,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -1482,7 +1483,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1525,14 +1526,14 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.mobilebert(
|
||||
input_ids,
|
||||
@@ -1543,7 +1544,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1565,7 +1566,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
|
||||
@@ -315,10 +315,10 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
lm_loss: Optional[torch.FloatTensor]
|
||||
mc_loss: Optional[torch.FloatTensor]
|
||||
lm_logits: torch.FloatTensor
|
||||
mc_logits: torch.FloatTensor
|
||||
lm_loss: Optional[torch.FloatTensor] = None
|
||||
mc_loss: Optional[torch.FloatTensor] = None
|
||||
lm_logits: torch.FloatTensor = None
|
||||
mc_logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -374,8 +374,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -425,13 +426,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -496,7 +497,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
|
||||
|
||||
return BaseModelOutput(
|
||||
@@ -538,7 +539,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -548,7 +549,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -559,7 +560,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
hidden_states = transformer_outputs[0]
|
||||
lm_logits = self.lm_head(hidden_states)
|
||||
@@ -573,7 +574,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (lm_logits,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -622,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
mc_labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -650,7 +651,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
import torch
|
||||
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
|
||||
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
@@ -662,7 +663,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
lm_logits = outputs.lm_logits
|
||||
mc_logits = outputs.mc_logits
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
if "lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
@@ -680,7 +681,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
hidden_states = transformer_outputs[0]
|
||||
|
||||
@@ -698,7 +699,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
mc_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (lm_logits, mc_logits) + transformer_outputs[1:]
|
||||
if mc_loss is not None:
|
||||
output = (mc_loss,) + output
|
||||
|
||||
@@ -63,7 +63,7 @@ class BaseModelOutputWithPooling(ModelOutput):
|
||||
"""
|
||||
|
||||
last_hidden_state: torch.FloatTensor
|
||||
pooler_output: torch.FloatTensor
|
||||
pooler_output: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -179,7 +179,7 @@ class CausalLMOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -213,8 +213,8 @@ class CausalLMOutputWithPast(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -243,8 +243,8 @@ class MaskedLMOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -291,8 +291,8 @@ class Seq2SeqLMOutput(ModelOutput):
|
||||
self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -324,8 +324,8 @@ class NextSentencePredictorOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -353,8 +353,8 @@ class SequenceClassifierOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -401,8 +401,8 @@ class Seq2SeqSequenceClassifierOutput(ModelOutput):
|
||||
self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -436,8 +436,8 @@ class MultipleChoiceModelOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -465,8 +465,8 @@ class TokenClassifierOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -496,9 +496,9 @@ class QuestionAnsweringModelOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
start_logits: torch.FloatTensor
|
||||
end_logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
start_logits: torch.FloatTensor = None
|
||||
end_logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -547,9 +547,9 @@ class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
|
||||
self-attention heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
start_logits: torch.FloatTensor
|
||||
end_logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
start_logits: torch.FloatTensor = None
|
||||
end_logits: torch.FloatTensor = None
|
||||
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
|
||||
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -39,13 +39,7 @@ from .file_utils import (
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_callable,
|
||||
)
|
||||
from .modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
CausalLMOutput,
|
||||
MaskedLMOutput,
|
||||
QuestionAnsweringModelOutput,
|
||||
SequenceClassifierOutput,
|
||||
)
|
||||
from .modeling_outputs import CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput
|
||||
from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
|
||||
|
||||
|
||||
@@ -1851,8 +1845,8 @@ class ReformerModelWithLMHeadOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
past_buckets_states: Optional[List[Tuple[torch.LongTensor, torch.FloatTensor]]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -1922,8 +1916,9 @@ REFORMER_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -1962,7 +1957,7 @@ class ReformerModel(ReformerPreTrainedModel):
|
||||
@add_code_sample_docstrings(
|
||||
tokenizer_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint="google/reformer-crime-and-punishment",
|
||||
output_type=BaseModelOutput,
|
||||
output_type=ReformerModelOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
@@ -1977,40 +1972,14 @@ class ReformerModel(ReformerPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_hidden_states=None,
|
||||
output_attentions=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
past_buckets_states (:obj:`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
|
||||
List of :obj:`tuple(torch.LongTensor, torch.FloatTensor` of length :obj:`config.n_layers`, with :obj:`tuple(0)` being the previous `buckets` of shape
|
||||
:obj:`(batch_size, num_heads, num_hashes, sequence_length)`)
|
||||
and :obj:`tuple(1)` being the previous `hidden_states` of shape
|
||||
:obj:`(batch_size, sequence_length, hidden_size)`).
|
||||
|
||||
Contains pre-computed buckets and hidden-states that can be used (see
|
||||
``past_buckets_states`` input) to speed up sequential decoding.
|
||||
all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
|
||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -2102,7 +2071,7 @@ class ReformerModel(ReformerPreTrainedModel):
|
||||
hidden_states = encoder_outputs.all_hidden_states if output_hidden_states else None
|
||||
attentions = encoder_outputs.all_attentions if output_attentions else None
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [sequence_output, past_buckets_states, hidden_states, attentions] if v is not None)
|
||||
return ReformerModelOutput(
|
||||
last_hidden_state=sequence_output,
|
||||
@@ -2208,7 +2177,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_hidden_states=None,
|
||||
output_attentions=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
labels=None,
|
||||
):
|
||||
r"""
|
||||
@@ -2218,7 +2187,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
reformer_outputs = self.reformer(
|
||||
input_ids,
|
||||
@@ -2231,7 +2200,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_hidden_states=output_hidden_states,
|
||||
output_attentions=output_attentions,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = reformer_outputs[0]
|
||||
@@ -2246,7 +2215,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + reformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -2326,7 +2295,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
|
||||
labels=None,
|
||||
output_hidden_states=None,
|
||||
output_attentions=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -2334,7 +2303,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
reformer_outputs = self.reformer(
|
||||
input_ids,
|
||||
@@ -2346,7 +2315,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
|
||||
use_cache=False, # no causal mask
|
||||
output_hidden_states=output_hidden_states,
|
||||
output_attentions=output_attentions,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = reformer_outputs[0]
|
||||
@@ -2357,7 +2326,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + reformer_outputs[1:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
@@ -2408,7 +2377,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
|
||||
labels=None,
|
||||
output_hidden_states=None,
|
||||
output_attentions=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -2427,7 +2396,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
|
||||
num_hashes=num_hashes,
|
||||
output_hidden_states=output_hidden_states,
|
||||
output_attentions=output_attentions,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -2443,7 +2412,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -2511,7 +2480,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_hidden_states=None,
|
||||
output_attentions=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -2523,7 +2492,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
reformer_outputs = self.reformer(
|
||||
input_ids,
|
||||
@@ -2535,7 +2504,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
|
||||
use_cache=False, # no causal mask
|
||||
output_hidden_states=output_hidden_states,
|
||||
output_attentions=output_attentions,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = reformer_outputs[0]
|
||||
@@ -2562,7 +2531,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + reformer_outputs[1:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
|
||||
@@ -143,8 +143,9 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -208,7 +209,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
@@ -227,7 +228,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.roberta(
|
||||
input_ids,
|
||||
@@ -238,7 +239,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.lm_head(sequence_output)
|
||||
@@ -248,7 +249,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + outputs[2:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
@@ -321,7 +322,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -330,7 +331,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.roberta(
|
||||
input_ids,
|
||||
@@ -341,7 +342,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = outputs[0]
|
||||
logits = self.classifier(sequence_output)
|
||||
@@ -356,7 +357,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -401,7 +402,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -409,7 +410,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
@@ -431,7 +432,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
||||
inputs_embeds=flat_inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
pooled_output = outputs[1]
|
||||
|
||||
@@ -444,7 +445,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -490,14 +491,14 @@ class RobertaForTokenClassification(BertPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.roberta(
|
||||
input_ids,
|
||||
@@ -508,7 +509,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -530,7 +531,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -595,7 +596,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -607,7 +608,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.roberta(
|
||||
input_ids,
|
||||
@@ -618,7 +619,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -645,7 +646,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
|
||||
@@ -675,7 +675,7 @@ class T5Stack(T5PreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
@@ -683,7 +683,7 @@ class T5Stack(T5PreTrainedModel):
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -787,7 +787,7 @@ class T5Stack(T5PreTrainedModel):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(
|
||||
v
|
||||
for v in [hidden_states, present_key_value_states, all_hidden_states, all_attentions]
|
||||
@@ -868,8 +868,9 @@ T5_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -930,7 +931,7 @@ class T5Model(T5PreTrainedModel):
|
||||
head_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -957,7 +958,7 @@ class T5Model(T5PreTrainedModel):
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# Encode if needed (training, first prediction pass)
|
||||
if encoder_outputs is None:
|
||||
@@ -968,9 +969,9 @@ class T5Model(T5PreTrainedModel):
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
elif not return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
encoder_outputs = BaseModelOutput(
|
||||
last_hidden_state=encoder_outputs[0],
|
||||
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
|
||||
@@ -1005,11 +1006,11 @@ class T5Model(T5PreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
if past is not None:
|
||||
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
|
||||
return decoder_outputs + encoder_outputs
|
||||
@@ -1081,7 +1082,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
head_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
@@ -1100,13 +1101,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
|
||||
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
|
||||
>>> outputs = model(input_ids=input_ids, labels=input_ids)
|
||||
>>> loss, prediction_scores = outputs[:2]
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
|
||||
>>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
|
||||
>>> outputs = model.generate(input_ids)
|
||||
"""
|
||||
@@ -1126,7 +1128,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# Encode if needed (training, first prediction pass)
|
||||
if encoder_outputs is None:
|
||||
@@ -1138,9 +1140,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
encoder_outputs = BaseModelOutput(
|
||||
last_hidden_state=encoder_outputs[0],
|
||||
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
|
||||
@@ -1174,7 +1176,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = decoder_outputs[0]
|
||||
@@ -1190,7 +1192,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
|
||||
|
||||
past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
if past is not None:
|
||||
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
|
||||
output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
|
||||
|
||||
@@ -618,7 +618,7 @@ class TransfoXLModelOutput(ModelOutput):
|
||||
"""
|
||||
|
||||
last_hidden_state: torch.FloatTensor
|
||||
mems: List[torch.FloatTensor]
|
||||
mems: List[torch.FloatTensor] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -650,9 +650,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
losses: Optional[torch.FloatTensor]
|
||||
prediction_scores: torch.FloatTensor
|
||||
mems: List[torch.FloatTensor]
|
||||
losses: Optional[torch.FloatTensor] = None
|
||||
prediction_scores: torch.FloatTensor = None
|
||||
mems: List[torch.FloatTensor] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
|
||||
@@ -695,8 +695,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -836,13 +837,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
|
||||
# so we transpose here from shape [bsz, len] to shape [len, bsz]
|
||||
@@ -941,7 +942,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
||||
# We transpose back here to shape [bsz, len, hidden_dim]
|
||||
core_out = core_out.transpose(0, 1).contiguous()
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
|
||||
|
||||
return TransfoXLModelOutput(
|
||||
@@ -1013,7 +1014,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1023,7 +1024,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
if input_ids is not None:
|
||||
bsz, tgt_len = input_ids.size(0), input_ids.size(1)
|
||||
elif inputs_embeds is not None:
|
||||
@@ -1038,7 +1039,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
last_hidden = transformer_outputs[0]
|
||||
@@ -1048,7 +1049,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
||||
prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
|
||||
loss = softmax_output.view(bsz, tgt_len - 1) if labels is not None else None
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
|
||||
@@ -1167,7 +1167,7 @@ class SQuADHead(nn.Module):
|
||||
cls_index: Optional[torch.LongTensor] = None,
|
||||
is_impossible: Optional[torch.LongTensor] = None,
|
||||
p_mask: Optional[torch.FloatTensor] = None,
|
||||
return_tuple: bool = False,
|
||||
return_dict: bool = False,
|
||||
) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
|
||||
"""
|
||||
Args:
|
||||
@@ -1184,8 +1184,8 @@ class SQuADHead(nn.Module):
|
||||
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
|
||||
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS).
|
||||
1.0 means token should be masked.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to return a plain tuple instead of a :class:`~transformers.file_utils.ModelOuput`.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
"""
|
||||
@@ -1214,7 +1214,7 @@ class SQuADHead(nn.Module):
|
||||
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
|
||||
total_loss += cls_loss * 0.5
|
||||
|
||||
return (total_loss,) if return_tuple else SquadHeadOutput(loss=total_loss)
|
||||
return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
|
||||
|
||||
else:
|
||||
# during inference, compute the end logits based on beam search
|
||||
@@ -1244,7 +1244,7 @@ class SQuADHead(nn.Module):
|
||||
start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
|
||||
cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
|
||||
else:
|
||||
return SquadHeadOutput(
|
||||
|
||||
@@ -367,8 +367,9 @@ XLM_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -482,13 +483,13 @@ class XLMModel(XLMPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None:
|
||||
bs, slen = input_ids.size()
|
||||
@@ -595,7 +596,7 @@ class XLMModel(XLMPreTrainedModel):
|
||||
# move back sequence length to dimension 0
|
||||
# tensor = tensor.transpose(0, 1)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
|
||||
return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
|
||||
|
||||
@@ -693,7 +694,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -703,7 +704,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -717,13 +718,13 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
output = transformer_outputs[0]
|
||||
outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided.
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return outputs + transformer_outputs[1:]
|
||||
|
||||
return MaskedLMOutput(
|
||||
@@ -770,7 +771,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -779,7 +780,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -793,7 +794,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
output = transformer_outputs[0]
|
||||
@@ -809,7 +810,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -857,7 +858,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -869,7 +870,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -883,7 +884,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = transformer_outputs[0]
|
||||
@@ -910,7 +911,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + transformer_outputs[1:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -957,7 +958,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
||||
p_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -984,7 +985,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
|
||||
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> start_positions = torch.tensor([1])
|
||||
@@ -993,7 +994,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
||||
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
>>> loss = outputs.loss
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -1007,7 +1008,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
output = transformer_outputs[0]
|
||||
@@ -1019,10 +1020,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
||||
cls_index=cls_index,
|
||||
is_impossible=is_impossible,
|
||||
p_mask=p_mask,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return outputs + transformer_outputs[1:]
|
||||
|
||||
return XLMForQuestionAnsweringOutput(
|
||||
@@ -1074,14 +1075,14 @@ class XLMForTokenClassification(XLMPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -1095,7 +1096,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1117,7 +1118,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1162,7 +1163,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1170,7 +1171,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
@@ -1204,7 +1205,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
output = transformer_outputs[0]
|
||||
logits = self.sequence_summary(output)
|
||||
@@ -1216,7 +1217,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
|
||||
@@ -53,12 +53,6 @@ XLM_ROBERTA_START_DOCSTRING = r"""
|
||||
config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
|
||||
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
"""
|
||||
|
||||
|
||||
|
||||
@@ -627,8 +627,8 @@ class XLNetLMHeadModelOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
mems: Optional[List[torch.FloatTensor]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -661,8 +661,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
mems: Optional[List[torch.FloatTensor]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -695,8 +695,8 @@ class XLNetForTokenClassificationOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
mems: Optional[List[torch.FloatTensor]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -731,8 +731,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
mems: Optional[List[torch.FloatTensor]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -767,9 +767,9 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor]
|
||||
start_logits: torch.FloatTensor
|
||||
end_logits: torch.FloatTensor
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
start_logits: torch.FloatTensor = None
|
||||
end_logits: torch.FloatTensor = None
|
||||
mems: Optional[List[torch.FloatTensor]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@@ -891,8 +891,9 @@ XLNET_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -1051,13 +1052,13 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
|
||||
|
||||
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
|
||||
@@ -1239,7 +1240,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
else:
|
||||
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None)
|
||||
|
||||
return XLNetModelOutput(
|
||||
@@ -1325,7 +1326,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1344,7 +1345,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
import torch
|
||||
|
||||
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
|
||||
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
|
||||
|
||||
# We show how to set up inputs to predict the next token using a bi-directional context.
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
|
||||
@@ -1369,7 +1370,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
loss = outputs.loss
|
||||
next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
@@ -1385,7 +1386,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
logits = self.lm_loss(transformer_outputs[0])
|
||||
@@ -1396,7 +1397,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1447,7 +1448,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1456,7 +1457,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
|
||||
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
@@ -1472,7 +1473,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
output = transformer_outputs[0]
|
||||
|
||||
@@ -1489,7 +1490,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1539,7 +1540,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1547,7 +1548,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
|
||||
|
||||
outputs = self.transformer(
|
||||
@@ -1563,7 +1564,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1584,7 +1585,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1634,7 +1635,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1642,7 +1643,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
@@ -1669,7 +1670,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
output = transformer_outputs[0]
|
||||
@@ -1683,7 +1684,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + transformer_outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -1734,7 +1735,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1746,7 +1747,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
|
||||
|
||||
outputs = self.transformer(
|
||||
@@ -1762,7 +1763,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -1789,7 +1790,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + outputs[1:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
@@ -1842,7 +1843,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -1869,7 +1870,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
|
||||
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
|
||||
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
|
||||
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
>>> start_positions = torch.tensor([1])
|
||||
@@ -1878,7 +1879,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
|
||||
>>> loss = outputs.loss
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
@@ -1894,7 +1895,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
hidden_states = transformer_outputs[0]
|
||||
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
|
||||
@@ -1924,7 +1925,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
|
||||
total_loss += cls_loss * 0.5
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (total_loss,) + transformer_outputs[1:]
|
||||
else:
|
||||
return XLNetForQuestionAnsweringOutput(
|
||||
@@ -1966,7 +1967,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
hidden_states, start_states=start_states, cls_index=cls_index
|
||||
) # Shape (batch size,): one single `cls_logits` for each sample
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
|
||||
return outputs + transformer_outputs[1:]
|
||||
else:
|
||||
|
||||
@@ -2122,6 +2122,6 @@ def pipeline(
|
||||
"Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
|
||||
"Trying to load the model with Tensorflow."
|
||||
)
|
||||
model = model_class.from_pretrained(model, config=config, return_tuple=True, **model_kwargs)
|
||||
model = model_class.from_pretrained(model, config=config, **model_kwargs)
|
||||
|
||||
return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)
|
||||
|
||||
@@ -661,9 +661,7 @@ class Trainer:
|
||||
|
||||
if self.args.past_index >= 0 and self._past is not None:
|
||||
inputs["mems"] = self._past
|
||||
# Our model outputs do not work with DataParallel, so forcing return tuple.
|
||||
if isinstance(model, nn.DataParallel):
|
||||
inputs["return_tuple"] = True
|
||||
|
||||
return inputs
|
||||
|
||||
def training_step(
|
||||
|
||||
@@ -260,8 +260,9 @@ XXX_INPUTS_DOCSTRING = r"""
|
||||
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
|
||||
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
|
||||
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
|
||||
plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@@ -310,13 +311,13 @@ class XxxModel(XxxPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -351,7 +352,7 @@ class XxxModel(XxxPreTrainedModel):
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.pooler(sequence_output)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
@@ -393,7 +394,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -402,7 +403,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -413,7 +414,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -424,7 +425,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + outputs[2:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
@@ -470,7 +471,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -479,7 +480,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
|
||||
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -490,7 +491,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -508,7 +509,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -550,7 +551,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -558,7 +559,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
|
||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
@@ -580,7 +581,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
@@ -594,7 +595,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (reshaped_logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -637,14 +638,14 @@ class XxxForTokenClassification(XxxPreTrainedModel):
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the token classification loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -655,7 +656,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -677,7 +678,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
@@ -720,7 +721,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
|
||||
end_positions=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_tuple=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -732,7 +733,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Positions outside of the sequence are not taken into account for computing the loss.
|
||||
"""
|
||||
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.transformer(
|
||||
input_ids,
|
||||
@@ -743,7 +744,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_tuple=return_tuple,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
@@ -770,7 +771,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if return_tuple:
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + outputs[2:]
|
||||
return ((total_loss,) + output) if total_loss is not None else output
|
||||
|
||||
|
||||
@@ -74,6 +74,7 @@ class ModelTesterMixin:
|
||||
|
||||
def test_save_load(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
config.return_dict = True
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
@@ -803,8 +804,6 @@ class ModelTesterMixin:
|
||||
|
||||
# Wrap model in nn.DataParallel
|
||||
model = torch.nn.DataParallel(model)
|
||||
# Our model outputs do not work with DataParallel, so forcing return tuple.
|
||||
inputs_dict["return_tuple"] = True
|
||||
with torch.no_grad():
|
||||
_ = model(**self._prepare_for_class(inputs_dict, model_class))
|
||||
|
||||
|
||||
@@ -329,7 +329,6 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
import tempfile
|
||||
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
config_and_inputs[0].return_tuple = True
|
||||
model = T5Model(config_and_inputs[0]).to(torch_device)
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
torch.onnx.export(
|
||||
|
||||
Reference in New Issue
Block a user