PyTorch CTRL + Style

This commit is contained in:
Lysandre
2020-01-20 16:17:54 -05:00
committed by Lysandre Debut
parent 980211a63a
commit 7511f3dd89
23 changed files with 141 additions and 209 deletions

View File

@@ -1,6 +1,14 @@
CTRL CTRL
---------------------------------------------------- ----------------------------------------------------
CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation`_
by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matters related to general usage and behavior.
Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl), Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl),
you'll be able to convert from TF to our HuggingFace/Transformers format using the you'll be able to convert from TF to our HuggingFace/Transformers format using the
``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 <https://github.com/huggingface/transformers/issues/1654>`_). ``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 <https://github.com/huggingface/transformers/issues/1654>`_).

View File

@@ -38,5 +38,6 @@ class XLMRobertaConfig(RobertaConfig):
This class overrides :class:`~transformers.RobertaConfig`. Please check the This class overrides :class:`~transformers.RobertaConfig`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "xlm-roberta" model_type = "xlm-roberta"

View File

@@ -607,7 +607,6 @@ class AlbertMLMHead(nn.Module):
"Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
) )
class AlbertForMaskedLM(AlbertPreTrainedModel): class AlbertForMaskedLM(AlbertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -698,7 +697,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
ALBERT_START_DOCSTRING, ALBERT_START_DOCSTRING,
) )
class AlbertForSequenceClassification(AlbertPreTrainedModel): class AlbertForSequenceClassification(AlbertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -794,7 +792,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
ALBERT_START_DOCSTRING, ALBERT_START_DOCSTRING,
) )
class AlbertForQuestionAnswering(AlbertPreTrainedModel): class AlbertForQuestionAnswering(AlbertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels

View File

@@ -813,7 +813,6 @@ class BertModel(BertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForPreTraining(BertPreTrainedModel): class BertForPreTraining(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -907,11 +906,8 @@ class BertForPreTraining(BertPreTrainedModel):
return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
"""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING
)
class BertForMaskedLM(BertPreTrainedModel): class BertForMaskedLM(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -1018,11 +1014,9 @@ class BertForMaskedLM(BertPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """, """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
BERT_START_DOCSTRING,
) )
class BertForNextSentencePrediction(BertPreTrainedModel): class BertForNextSentencePrediction(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -1105,7 +1099,6 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForSequenceClassification(BertPreTrainedModel): class BertForSequenceClassification(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -1198,7 +1191,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForMultipleChoice(BertPreTrainedModel): class BertForMultipleChoice(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -1294,7 +1286,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForTokenClassification(BertPreTrainedModel): class BertForTokenClassification(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -1386,7 +1377,6 @@ class BertForTokenClassification(BertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForQuestionAnswering(BertPreTrainedModel): class BertForQuestionAnswering(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super(BertForQuestionAnswering, self).__init__(config) super(BertForQuestionAnswering, self).__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels

View File

@@ -58,19 +58,20 @@ class CamembertModel(RobertaModel):
This class overrides :class:`~transformers.RobertaModel`. Please check the This class overrides :class:`~transformers.RobertaModel`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = CamembertConfig config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings( @add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top. """, """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
CAMEMBERT_START_DOCSTRING,
) )
class CamembertForMaskedLM(RobertaForMaskedLM): class CamembertForMaskedLM(RobertaForMaskedLM):
""" """
This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = CamembertConfig config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -85,6 +86,7 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification):
This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = CamembertConfig config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -99,9 +101,11 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice):
This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = CamembertConfig config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings( @add_start_docstrings(
"""CamemBERT Model with a token classification head on top (a linear layer on top of """CamemBERT Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
@@ -112,6 +116,6 @@ class CamembertForTokenClassification(RobertaForTokenClassification):
This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = CamembertConfig config_class = CamembertConfig
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP

View File

@@ -24,7 +24,7 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss from torch.nn import CrossEntropyLoss
from .configuration_ctrl import CTRLConfig from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import Conv1D, PreTrainedModel from .modeling_utils import Conv1D, PreTrainedModel
@@ -184,57 +184,53 @@ class CTRLPreTrainedModel(PreTrainedModel):
module.weight.data.fill_(1.0) module.weight.data.fill_(1.0)
CTRL_START_DOCSTRING = r""" CTRL model was proposed in CTRL_START_DOCSTRING = r"""
`CTRL: A Conditional Transformer Language Model for Controllable Generation`_
by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matters related to general usage and behavior. refer to the PyTorch documentation for all matters related to general usage and behavior.
.. _`CTRL: A Conditional Transformer Language Model for Controllable Generation`:
https://www.github.com/salesforce/ctrl
.. _`torch.nn.Module`:
https://pytorch.org/docs/stable/nn.html#module
Parameters: Parameters:
config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
""" """
CTRL_INPUTS_DOCSTRING = r""" Inputs: CTRL_INPUTS_DOCSTRING = r"""
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`transformers.CTRLTokenizer`. Indices can be obtained using :class:`transformers.CTRLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
**past**:
list of ``torch.FloatTensor`` (one for each layer): `What are input IDs? <../glossary.html#input-ids>`__
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed. should not be passed as input ids as they have already been computed.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs). `What are attention masks? <../glossary.html#attention-mask>`__
The embeddings from these tokens will be summed with the respective token embeddings. token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). Segment token indices to indicate first and second portions of the inputs.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`__
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Indices of positions of each input sequence token in the position embeddings. Indices of positions of each input sequence token in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``. Selected in the range ``[0, config.max_position_embeddings - 1]``.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
`What are position IDs? <../glossary.html#position-ids>`__
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``: inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation. Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
""" """
@@ -243,35 +239,8 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
@add_start_docstrings( @add_start_docstrings(
"The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING, CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
) )
class CTRLModel(CTRLPreTrainedModel): class CTRLModel(CTRLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**past**:
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
that contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -310,6 +279,7 @@ class CTRLModel(CTRLPreTrainedModel):
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads) self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
@@ -320,6 +290,36 @@ class CTRLModel(CTRLPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
): ):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`CTRLConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None: elif input_ids is not None:
@@ -435,50 +435,10 @@ class CTRLModel(CTRLPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""The CTRL Model transformer with a language modeling head on top """The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, (linear layer with weights tied to the input embeddings). """,
CTRL_START_DOCSTRING, CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
) )
class CTRLLMHeadModel(CTRLPreTrainedModel): class CTRLLMHeadModel(CTRLPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**past**:
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
that contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -499,6 +459,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
inputs.update(kwargs) inputs.update(kwargs)
return inputs return inputs
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
@@ -510,6 +471,49 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
): ):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``.
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`~transformers.CTRLConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
Language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
past=past, past=past,

View File

@@ -390,7 +390,6 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class DistilBertModel(DistilBertPreTrainedModel): class DistilBertModel(DistilBertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -484,11 +483,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """, """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
DISTILBERT_START_DOCSTRING,
) )
class DistilBertForMaskedLM(DistilBertPreTrainedModel): class DistilBertForMaskedLM(DistilBertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
@@ -567,7 +564,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class DistilBertForSequenceClassification(DistilBertPreTrainedModel): class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -645,7 +641,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -745,7 +740,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class DistilBertForTokenClassification(DistilBertPreTrainedModel): class DistilBertForTokenClassification(DistilBertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels

View File

@@ -269,12 +269,6 @@ GPT2_START_DOCSTRING = r"""
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matters related to general usage and behavior. refer to the PyTorch documentation for all matters related to general usage and behavior.
.. _`Language Models are Unsupervised Multitask Learners`:
https://openai.com/blog/better-language-models/
.. _`torch.nn.Module`:
https://pytorch.org/docs/stable/nn.html#module
Parameters: Parameters:
config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model. config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the configuration.
@@ -328,7 +322,6 @@ GPT2_INPUTS_DOCSTRING = r"""
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
class GPT2Model(GPT2PreTrainedModel): class GPT2Model(GPT2PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
@@ -514,7 +507,6 @@ class GPT2Model(GPT2PreTrainedModel):
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
class GPT2LMHeadModel(GPT2PreTrainedModel): class GPT2LMHeadModel(GPT2PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.transformer = GPT2Model(config) self.transformer = GPT2Model(config)
@@ -624,7 +616,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
class GPT2DoubleHeadsModel(GPT2PreTrainedModel): class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
config.num_labels = 1 config.num_labels = 1

View File

@@ -338,7 +338,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_START_DOCSTRING,
) )
class OpenAIGPTModel(OpenAIGPTPreTrainedModel): class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
@@ -493,7 +492,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_START_DOCSTRING,
) )
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.transformer = OpenAIGPTModel(config) self.transformer = OpenAIGPTModel(config)
@@ -587,7 +585,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_START_DOCSTRING,
) )
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)

View File

@@ -150,6 +150,7 @@ class RobertaModel(BertModel):
This class overrides :class:`~transformers.BertModel`. Please check the This class overrides :class:`~transformers.BertModel`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = RobertaConfig config_class = RobertaConfig
pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "roberta" base_model_prefix = "roberta"
@@ -167,9 +168,7 @@ class RobertaModel(BertModel):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings = value
@add_start_docstrings( @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
"""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING
)
class RobertaForMaskedLM(BertPreTrainedModel): class RobertaForMaskedLM(BertPreTrainedModel):
config_class = RobertaConfig config_class = RobertaConfig
pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -652,7 +651,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
position_ids=position_ids, position_ids=position_ids,
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds inputs_embeds=inputs_embeds,
) )
sequence_output = outputs[0] sequence_output = outputs[0]

View File

@@ -560,7 +560,6 @@ ALBERT_INPUTS_DOCSTRING = r"""
ALBERT_START_DOCSTRING, ALBERT_START_DOCSTRING,
) )
class TFAlbertModel(TFAlbertPreTrainedModel): class TFAlbertModel(TFAlbertPreTrainedModel):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(config, **kwargs) super().__init__(config, **kwargs)
self.num_hidden_layers = config.num_hidden_layers self.num_hidden_layers = config.num_hidden_layers
@@ -705,11 +704,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
return outputs return outputs
@add_start_docstrings( @add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
"""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING
)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs) super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
@@ -766,7 +762,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
ALBERT_START_DOCSTRING, ALBERT_START_DOCSTRING,
) )
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs) super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels

View File

@@ -660,7 +660,6 @@ BERT_INPUTS_DOCSTRING = r"""
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertModel(TFBertPreTrainedModel): class TFBertModel(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name="bert") self.bert = TFBertMainLayer(config, name="bert")
@@ -711,7 +710,6 @@ class TFBertModel(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForPreTraining(TFBertPreTrainedModel): class TFBertForPreTraining(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
@@ -767,11 +765,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
"""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING
)
class TFBertForMaskedLM(TFBertPreTrainedModel): class TFBertForMaskedLM(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
@@ -822,11 +817,9 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """, """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
BERT_START_DOCSTRING,
) )
class TFBertForNextSentencePrediction(TFBertPreTrainedModel): class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
@@ -879,7 +872,6 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForSequenceClassification(TFBertPreTrainedModel): class TFBertForSequenceClassification(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -938,7 +930,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForMultipleChoice(TFBertPreTrainedModel): class TFBertForMultipleChoice(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
@@ -1049,7 +1040,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForTokenClassification(TFBertPreTrainedModel): class TFBertForTokenClassification(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -1108,7 +1098,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForQuestionAnswering(TFBertPreTrainedModel): class TFBertForQuestionAnswering(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels

View File

@@ -536,7 +536,6 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertModel(TFDistilBertPreTrainedModel): class TFDistilBertModel(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@@ -594,11 +593,9 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
@add_start_docstrings( @add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """, """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
@@ -665,7 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -730,7 +726,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -786,7 +781,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)

View File

@@ -444,7 +444,6 @@ GPT2_INPUTS_DOCSTRING = r"""
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
class TFGPT2Model(TFGPT2PreTrainedModel): class TFGPT2Model(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer") self.transformer = TFGPT2MainLayer(config, name="transformer")
@@ -494,7 +493,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer") self.transformer = TFGPT2MainLayer(config, name="transformer")
@@ -557,7 +555,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
GPT2_START_DOCSTRING, GPT2_START_DOCSTRING,
) )
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
config.num_labels = 1 config.num_labels = 1

View File

@@ -427,7 +427,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_START_DOCSTRING,
) )
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@@ -473,7 +472,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_START_DOCSTRING,
) )
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@@ -531,7 +529,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_START_DOCSTRING,
) )
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
config.num_labels = 1 config.num_labels = 1

View File

@@ -180,7 +180,6 @@ ROBERTA_INPUTS_DOCSTRING = r"""
ROBERTA_START_DOCSTRING, ROBERTA_START_DOCSTRING,
) )
class TFRobertaModel(TFRobertaPreTrainedModel): class TFRobertaModel(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta") self.roberta = TFRobertaMainLayer(config, name="roberta")
@@ -256,11 +255,8 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
return x return x
@add_start_docstrings( @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
"""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING
)
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
@@ -340,7 +336,6 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
ROBERTA_START_DOCSTRING, ROBERTA_START_DOCSTRING,
) )
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -394,7 +389,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
ROBERTA_START_DOCSTRING, ROBERTA_START_DOCSTRING,
) )
class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels

View File

@@ -687,7 +687,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_START_DOCSTRING,
) )
class TFTransfoXLModel(TFTransfoXLPreTrainedModel): class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@@ -737,7 +736,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_START_DOCSTRING,
) )
class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.transformer = TFTransfoXLMainLayer(config, name="transformer")

View File

@@ -571,7 +571,6 @@ XLM_INPUTS_DOCSTRING = r"""
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class TFXLMModel(TFXLMPreTrainedModel): class TFXLMModel(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer") self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -650,7 +649,6 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer") self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -705,7 +703,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class TFXLMForSequenceClassification(TFXLMPreTrainedModel): class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -760,7 +757,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer") self.transformer = TFXLMMainLayer(config, name="transformer")

View File

@@ -780,7 +780,6 @@ XLNET_INPUTS_DOCSTRING = r"""
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class TFXLNetModel(TFXLNetPreTrainedModel): class TFXLNetModel(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer") self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -830,7 +829,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer") self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -896,7 +894,6 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -961,7 +958,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -1015,11 +1011,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
return outputs # return logits, (mems), (hidden states), (attentions) return outputs # return logits, (mems), (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of @add_start_docstrings(
"""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """, the hidden-states output to compute `span start logits` and `span end logits`). """,
XLNET_START_DOCSTRING) XLNET_START_DOCSTRING,
)
class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer") self.transformer = TFXLNetMainLayer(config, name="transformer")

View File

@@ -550,7 +550,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_START_DOCSTRING,
) )
class TransfoXLModel(TransfoXLPreTrainedModel): class TransfoXLModel(TransfoXLPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
@@ -803,7 +802,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_START_DOCSTRING,
) )
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.transformer = TransfoXLModel(config) self.transformer = TransfoXLModel(config)

View File

@@ -317,7 +317,6 @@ XLM_INPUTS_DOCSTRING = r"""
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class XLMModel(XLMPreTrainedModel): class XLMModel(XLMPreTrainedModel):
def __init__(self, config): # , dico, is_encoder, with_output): def __init__(self, config): # , dico, is_encoder, with_output):
super().__init__(config) super().__init__(config)
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
@@ -620,7 +619,6 @@ class XLMPredLayer(nn.Module):
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class XLMWithLMHeadModel(XLMPreTrainedModel): class XLMWithLMHeadModel(XLMPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.transformer = XLMModel(config) self.transformer = XLMModel(config)
@@ -718,7 +716,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class XLMForSequenceClassification(XLMPreTrainedModel): class XLMForSequenceClassification(XLMPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -813,7 +810,6 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -929,7 +925,6 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
XLM_START_DOCSTRING, XLM_START_DOCSTRING,
) )
class XLMForQuestionAnswering(XLMPreTrainedModel): class XLMForQuestionAnswering(XLMPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)

View File

@@ -61,19 +61,20 @@ class XLMRobertaModel(RobertaModel):
This class overrides :class:`~transformers.RobertaModel`. Please check the This class overrides :class:`~transformers.RobertaModel`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model with a `language modeling` head on top. """, """XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForMaskedLM(RobertaForMaskedLM): class XLMRobertaForMaskedLM(RobertaForMaskedLM):
""" """
This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -88,6 +89,7 @@ class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -102,6 +104,7 @@ class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -116,5 +119,6 @@ class XLMRobertaForTokenClassification(RobertaForTokenClassification):
This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the
superclass for the appropriate documentation alongside usage examples. superclass for the appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP

View File

@@ -575,7 +575,6 @@ XLNET_INPUTS_DOCSTRING = r"""
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetModel(XLNetPreTrainedModel): class XLNetModel(XLNetPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
@@ -929,7 +928,6 @@ class XLNetModel(XLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetLMHeadModel(XLNetPreTrainedModel): class XLNetLMHeadModel(XLNetPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.attn_type = config.attn_type self.attn_type = config.attn_type
@@ -1060,7 +1058,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForSequenceClassification(XLNetPreTrainedModel): class XLNetForSequenceClassification(XLNetPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -1161,7 +1158,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForTokenClassification(XLNetPreTrainedModel): class XLNetForTokenClassification(XLNetPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -1262,7 +1258,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForMultipleChoice(XLNetPreTrainedModel): class XLNetForMultipleChoice(XLNetPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
@@ -1369,7 +1364,6 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
@@ -1486,7 +1480,6 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForQuestionAnswering(XLNetPreTrainedModel): class XLNetForQuestionAnswering(XLNetPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.start_n_top = config.start_n_top self.start_n_top = config.start_n_top