Refactor Code samples; Test code samples (#5036)

* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rust of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Lysandre Debut
2020-06-25 16:46:00 -04:00
committed by GitHub
parent 315f464b0a
commit 364a5ae1f0
68 changed files with 1962 additions and 2979 deletions

View File

@@ -10,3 +10,7 @@
.highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
color: #6670FF;
}
.highlight .gp {
color: #FB8D68;
}

View File

@@ -44,6 +44,7 @@
display: flex;
flex-direction: row;
justify-content: flex-end;
margin-right: 30px;
}
.framework-selector > button {
@@ -60,6 +61,12 @@
padding: 5px;
}
/* Copy button */
a.copybtn {
margin: 3px;
}
/* The literal code blocks */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
color: #6670FF;

View File

@@ -157,6 +157,8 @@ function platformToggle() {
const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
const pytorchIdentifier = "## PYTORCH CODE";
const tensorflowIdentifier = "## TENSORFLOW CODE";
const promptSpanIdentifier = `<span class="gp">&gt;&gt;&gt; </span>`
const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;
@@ -169,10 +171,22 @@ function platformToggle() {
let tensorflowSpans;
if(pytorchSpanPosition < tensorflowSpanPosition){
pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
const isPrompt = spans.slice(
spans.indexOf(tensorflowSpanIdentifier) - promptSpanIdentifier.length,
spans.indexOf(tensorflowSpanIdentifier)
) == promptSpanIdentifier;
const finalTensorflowSpanPosition = isPrompt ? tensorflowSpanPosition - promptSpanIdentifier.length : tensorflowSpanPosition;
pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, finalTensorflowSpanPosition);
tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
}else{
tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
const isPrompt = spans.slice(
spans.indexOf(pytorchSpanIdentifier) - promptSpanIdentifier.length,
spans.indexOf(pytorchSpanIdentifier)
) == promptSpanIdentifier;
const finalPytorchSpanPosition = isPrompt ? pytorchSpanPosition - promptSpanIdentifier.length : pytorchSpanPosition;
tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, finalPytorchSpanPosition);
pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
}

View File

@@ -44,7 +44,8 @@ extensions = [
'sphinx.ext.napoleon',
'recommonmark',
'sphinx.ext.viewcode',
'sphinx_markdown_tables'
'sphinx_markdown_tables',
'sphinx_copybutton'
]
# Add any paths that contain templates here, relative to this directory.
@@ -74,6 +75,8 @@ exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
# Remove the prompt when copying examples
copybutton_prompt_text = ">>> "
# -- Options for HTML output -------------------------------------------------

View File

@@ -45,17 +45,16 @@ tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ token
::
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence = "A Titan RTX has 24GB of VRAM"
>>> sequence = "A Titan RTX has 24GB of VRAM"
The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
::
tokenized_sequence = tokenizer.tokenize(sequence)
print(tokenized_sequence)
>>> tokenized_sequence = tokenizer.tokenize(sequence)
The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-dash is
@@ -63,6 +62,7 @@ added for "RA" and "M":
::
>>> print(tokenized_sequence)
['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
@@ -71,14 +71,14 @@ the sentence to the tokenizer, which leverages the Rust implementation of
::
encoded_sequence = tokenizer(sequence)["input_ids"]
print(encoded_sequence)
>>> encoded_sequence = tokenizer(sequence)["input_ids"]
The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
token indices are under the key "input_ids":
::
>>> print(encoded_sequence)
[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
Note that the tokenizer automatically adds "special tokens" (if the associated model rely on them) which are special
@@ -86,13 +86,14 @@ IDs the model sometimes uses. If we decode the previous sequence of ids,
::
tokenizer.decode(encoded_sequence)
>>> decoded_sequence = tokenizer.decode(encoded_sequence)
we will see
::
'[CLS] A Titan RTX has 24GB of VRAM [SEP]'
>>> print(decoded_sequence)
[CLS] A Titan RTX has 24GB of VRAM [SEP]
because this is the way a :class:`~transformers.BertModel` is going to expect its inputs.
@@ -108,21 +109,20 @@ For example, consider these two sequences:
::
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
>>> sequence_a = "This is a short sequence."
>>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
len(encoded_sequence_a), len(encoded_sequence_b)
>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
The encoded versions have different lengths:
::
>>> len(encoded_sequence_a), len(encoded_sequence_b)
(8, 19)
Therefore, we can't be put then together in a same tensor as-is. The first sequence needs to be padded up to the length
@@ -133,15 +133,14 @@ it to pad like this:
::
padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
padded_sequences["input_ids"]
>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
::
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
>>> padded_sequences["input_ids"]
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
the position of the padded indices so that the model does not attend to them. For the
@@ -150,14 +149,8 @@ a padded value. This attention mask is in the dictionary returned by the tokeniz
::
padded_sequences["attention_mask"]
will give back
::
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
>>> padded_sequences["attention_mask"]
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
.. _token-type-ids:
@@ -170,26 +163,27 @@ tokens. For example, the BERT model builds its two sequence input as such:
::
# [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
We can use our tokenizer to automatically generate such a sentence by passing the two sequences as two arguments (and
not a list like before) like this:
::
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence_a = "HuggingFace is based in NYC"
sequence_b = "Where is HuggingFace based?"
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence_a = "HuggingFace is based in NYC"
>>> sequence_b = "Where is HuggingFace based?"
encoded_dict = tokenizer(sequence_a, sequence_b)
tokenizer.decode(encoded_dict["input_ids"])
>>> encoded_dict = tokenizer(sequence_a, sequence_b)
>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
which will return:
::
"[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
>>> print(decoded)
[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
This is enough for some models to understand where one sequence ends and where another begins. However, other models
such as BERT have an additional mechanism, which are the token type IDs (also called segment IDs). They are a binary
@@ -199,12 +193,7 @@ The tokenizer returns in the dictionary under the key "token_type_ids":
::
encoded_dict['token_type_ids']
will return
::
>>> encoded_dict['token_type_ids']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the

View File

@@ -36,10 +36,11 @@ Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language m
.. code-block::
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel
>>> import torch
>>> from transformers import XLMTokenizer, XLMWithLMHeadModel
tokenizer = XLMTokenizer.from_pretrained("xlm-clm-1024-enfr")
>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
@@ -47,16 +48,15 @@ The different languages this model/tokenizer handles, as well as the ids of thes
.. code-block::
# Continuation of the previous script
print(tokenizer.lang2id) # {'en': 0, 'fr': 1}
>>> print(tokenizer.lang2id)
{'en': 0, 'fr': 1}
These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
.. code-block::
# Continuation of the previous script
input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
We should now define the language embedding by using the previously defined language id. We want to create a tensor
@@ -64,20 +64,18 @@ filled with the appropriate language ids, of the same size as input_ids. For eng
.. code-block::
# Continuation of the previous script
language_id = tokenizer.lang2id['en'] # 0
langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0])
>>> language_id = tokenizer.lang2id['en'] # 0
>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0])
# We reshape it to be of size (batch_size, sequence_length)
langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
>>> # We reshape it to be of size (batch_size, sequence_length)
>>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
You can then feed it all as input to your model:
.. code-block::
# Continuation of the previous script
outputs = model(input_ids, langs=langs)
>>> outputs = model(input_ids, langs=langs)
The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py>`__

View File

@@ -32,40 +32,33 @@ provides the following tasks out of the box:
Let's see how this work for sentiment analysis (the other tasks are all covered in the
:doc:`task summary </task_summary>`):
::
.. code-block::
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
>>> from transformers import pipeline
>>> classifier = pipeline('sentiment-analysis')
When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will
look at both later on, but as an introduction the tokenizer's job is to preprocess the text for the model, which is
then responsible for making predictions. The pipeline groups all of that together, and post-process the predictions to
make them readable. For instance
make them readable. For instance:
::
classifier('We are very happy to show you the 🤗 Transformers library.')
will return something like this:
::
.. code-block::
>>> classifier('We are very happy to show you the 🤗 Transformers library.')
[{'label': 'POSITIVE', 'score': 0.9997795224189758}]
That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
`batch`:
`batch`, returning a list of dictionaries like this one:
::
.. code-block::
classifier(["We are very happy to show you the 🤗 Transformers library.",
"We hope you don't hate it."])
returning a list of dictionaries like this one:
::
[{'label': 'POSITIVE', 'score': 0.9997795224189758},
{'label': 'NEGATIVE', 'score': 0.5308589935302734}]
>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.",
... "We hope you don't hate it."])
>>> for result in results:
... print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309
You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
fairly neutral.
@@ -83,9 +76,9 @@ see how we can use it.
You can directly pass the name of the model to use to :func:`~transformers.pipeline`:
::
.. code-block::
classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
>>> classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also
replace that name by a local folder where you have saved a pretrained model (see below). You can also pass a model
@@ -98,29 +91,30 @@ tokenizer associated to the model we picked and instantiate it. The second is
the model itself. Note that if we were using the library on an other task, the class of the model would change. The
:doc:`task summary </task_summary>` tutorial summarizes which class is used for which task.
::
.. code-block::
## PYTORCH CODE
from transformers import AutoTokenizer, AutoModelForSequenceClassification
## TENSORFLOW CODE
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
>>> ## PYTORCH CODE
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
>>> ## TENSORFLOW CODE
>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
Now, to download the models and tokenizer we found previously, we just have to use the
:func:`~transformers.AutoModelForSequenceClassification.from_pretrained` method (feel free to replace ``model_name`` by
any other model from the model hub):
::
.. code-block::
## PYTORCH CODE
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
pipe = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
## TENSORFLOW CODE
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
>>> ## PYTORCH CODE
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
>>> pipe = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
>>> ## TENSORFLOW CODE
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> # This model only exists in PyTorch, so we use the `from_pt` flag to import that model in TensorFlow.
>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
>>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a
pretrained model on your data. We provide :doc:`example scripts </examples>` to do so. Once you're done, don't forget
@@ -136,16 +130,16 @@ using the :obj:`from_pretrained` method:
::
## PYTORCH CODE
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
## TENSORFLOW CODE
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
>>> ## PYTORCH CODE
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
>>> ## TENSORFLOW CODE
>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
Using the tokenizer
^^^^^^^^^^^^^^^^^^^
@@ -161,48 +155,56 @@ the model. To do this, the tokenizer has a `vocab`, which is the part we downloa
To apply these steps on a given text, we can just feed it to our tokenizer:
::
.. code-block::
input = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(input)
>>> inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
This returns a dictionary string to list of ints. It contains the `ids of the tokens <glossary.html#input-ids>`__,
as mentioned before, but also additional arguments that will be useful to the model. Here for instance, we also have an
`attention mask <glossary.html#attention-mask>`__ that the model will use to have a better understanding of the sequence:
::
{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
.. code-block::
>>> print(inputs)
{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
and get tensors back. You can specify all of that to the tokenizer:
::
.. code-block::
## PYTORCH CODE
batch = tokenizer(
["We are very happy to show you the 🤗 Transformers library.",
"We hope you don't hate it."],
padding=True, truncation=True, return_tensors="pt")
print(batch)
## TENSORFLOW CODE
batch = tokenizer(
["We are very happy to show you the 🤗 Transformers library.",
"We hope you don't hate it."],
padding=True, truncation=True, return_tensors="tf")
print(batch)
>>> ## PYTORCH CODE
>>> pt_batch = tokenizer(
... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
... padding=True,
... truncation=True,
... return_tensors="pt"
... )
>>> ## TENSORFLOW CODE
>>> tf_batch = tokenizer(
... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
... padding=True,
... truncation=True,
... return_tensors="tf"
... )
The padding is automatically applied on the side the model expect it (in this case, on the right), with the
padding token the model was pretrained with. The attention mask is also adapted to take the padding into account:
::
.. code-block::
{'input_ids': tensor([[ 101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102],
[ 101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}
>>> ## PYTORCH CODE
>>> for key, value in pt_batch.items():
... print(f"{key}: {value.numpy().tolist()}")
input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
>>> ## TENSORFLOW CODE
>>> for key, value in tf_batch.items():
... print(f"{key}: {value.numpy().tolist()}")
input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
You can learn more about tokenizers :doc:`here <preprocessing>`.
@@ -213,20 +215,27 @@ Once your input has been preprocessed by the tokenizer, you can directly send it
contain all the relevant information the model needs. If you're using a TensorFlow model, you can directly pass the
dictionary keys to tensor, for a PyTorch model, you need to unpack the dictionary by adding :obj:`**`.
::
.. code-block::
## PYTORCH CODE
outputs = model(**batch)
## TENSORFLOW CODE
outputs = model(batch)
>>> ## PYTORCH CODE
>>> pt_outputs = pt_model(**pt_batch)
>>> ## TENSORFLOW CODE
>>> tf_outputs = tf_model(tf_batch)
In 🤗 Transformers, all outputs are tuples (with only one element potentially). Here, we get a tuple with just the
final activations of the model.
::
.. code-block::
(tensor([[-4.1329, 4.3811],
[ 0.0818, -0.0418]]),)
>>> ## PYTORCH CODE
>>> print(pt_outputs)
(tensor([[-4.0833, 4.3364],
[ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)
>>> ## TENSORFLOW CODE
>>> print(tf_outputs)
(<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-4.0832963 , 4.3364134 ],
[ 0.08181238, -0.04178794]], dtype=float32)>,)
.. note::
@@ -235,33 +244,39 @@ final activations of the model.
Let's apply the SoftMax activation to get predictions.
::
.. code-block::
## PYTORCH CODE
import torch.nn.functional as F
predictions = F.softmax(outputs[0], dim=-1)
print(predictions)
## TENSORFLOW CODE
predictions = tf.nn.softmax(outputs[0], axis=-1)
print(predictions)
>>> ## PYTORCH CODE
>>> import torch.nn.functional as F
>>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
>>> ## TENSORFLOW CODE
>>> import tensorflow as tf
>>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
We can see we get the numbers from before:
::
.. code-block::
tensor([[2.0060e-04, 9.9980e-01],
[5.3086e-01, 4.6914e-01]])
>>> ## TENSORFLOW CODE
>>> print(tf_predictions)
tf.Tensor(
[[2.2042994e-04 9.9977952e-01]
[5.3086078e-01 4.6913919e-01]], shape=(2, 2), dtype=float32)
>>> ## PYTORCH CODE
>>> print(pt_predictions)
tensor([[2.2043e-04, 9.9978e-01],
[5.3086e-01, 4.6914e-01]], grad_fn=<SoftmaxBackward>)
If you have labels, you can provide them to the model, it will return a tuple with the loss and the final activations.
::
.. code-block::
## PYTORCH CODE
import torch
outputs = model(**batch, labels = torch.tensor([1, 0])
## TENSORFLOW CODE
import tensorflow as tf
outputs = model(batch, labels = tf.constant([1, 0])
>>> ## PYTORCH CODE
>>> import torch
>>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
>>> ## TENSORFLOW CODE
>>> import tensorflow as tf
>>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or
`tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual
@@ -298,12 +313,12 @@ Lastly, you can also ask the model to return all hidden states and all attention
::
## PYTORCH CODE
outputs = model(**batch, output_hidden_states=True, output_attentions=True)
all_hidden_states, all_attentions = outputs[-2:]
## TENSORFLOW CODE
outputs = model(batch, output_hidden_states=True, output_attentions=True)
all_hidden_states, all_attentions = outputs[-2:]
>>> ## PYTORCH CODE
>>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
>>> all_hidden_states, all_attentions = pt_outputs[-2:]
>>> ## TENSORFLOW CODE
>>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
>>> all_hidden_states, all_attentions = tf_outputs[-2:]
Accessing the code
^^^^^^^^^^^^^^^^^^
@@ -318,18 +333,18 @@ using the :doc:`DistilBERT </model_doc/distilbert>` architecture. The model auto
to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
without the auto magic:
::
.. code-block::
## PYTORCH CODE
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = DistilBertForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
## TENSORFLOW CODE
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
>>> ## PYTORCH CODE
>>> from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> model = DistilBertForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
>>> ## TENSORFLOW CODE
>>> from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
Customizing the model
^^^^^^^^^^^^^^^^^^^^^
@@ -345,18 +360,18 @@ Here we use the predefined vocabulary of DistilBERT (hence load the tokenizer wi
instantiate the model from the configuration instead of using the
:func:`~transformers.DistilBertForSequenceClassification.from_pretrained` method).
::
.. code-block::
## PYTORCH CODE
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification(config)
## TENSORFLOW CODE
from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification(config)
>>> ## PYTORCH CODE
>>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
>>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
>>> model = DistilBertForSequenceClassification(config)
>>> ## TENSORFLOW CODE
>>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
>>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
>>> model = TFDistilBertForSequenceClassification(config)
For something that only changes the head of the model (for instance, the number of labels), you can still use a
pretrained model for the body. For instance, let's define a classifier for 10 different labels using a pretrained body.
@@ -364,15 +379,15 @@ We could create a configuration with all the default values and just change the
can directly pass any argument a configuration would take to the :func:`from_pretrained` method and it will update the
default configuration with it:
::
.. code-block::
## PYTORCH CODE
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
model_name = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
## TENSORFLOW CODE
from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
model_name = "distilbert-base-uncased"
model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
>>> ## PYTORCH CODE
>>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased"
>>> model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
>>> ## TENSORFLOW CODE
>>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased"
>>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)

File diff suppressed because it is too large Load Diff

View File

@@ -86,7 +86,7 @@ extras["all"] = extras["serving"] + ["tensorflow", "torch"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "psutil"]
# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3"]
extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"]
extras["quality"] = [
"black",
"isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",

View File

@@ -81,22 +81,22 @@ class AlbertConfig(PretrainedConfig):
Example::
from transformers import AlbertConfig, AlbertModel
# Initializing an ALBERT-xxlarge style configuration
albert_xxlarge_configuration = AlbertConfig()
>>> from transformers import AlbertConfig, AlbertModel
>>> # Initializing an ALBERT-xxlarge style configuration
>>> albert_xxlarge_configuration = AlbertConfig()
# Initializing an ALBERT-base style configuration
albert_base_configuration = AlbertConfig(
hidden_size=768,
num_attention_heads=12,
intermediate_size=3072,
)
>>> # Initializing an ALBERT-base style configuration
>>> albert_base_configuration = AlbertConfig(
... hidden_size=768,
... num_attention_heads=12,
... intermediate_size=3072,
... )
# Initializing a model from the ALBERT-base style configuration
model = AlbertModel(albert_xxlarge_configuration)
>>> # Initializing a model from the ALBERT-base style configuration
>>> model = AlbertModel(albert_xxlarge_configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "albert"

View File

@@ -73,9 +73,13 @@ class BartConfig(PretrainedConfig):
):
r"""
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.
Examples:
config = BartConfig.from_pretrained('bart-large')
model = BartModel(config)
Examples::
>>> from transformers import BartConfig, BartModel
>>> config = BartConfig.from_pretrained('facebook/bart-large')
>>> model = BartModel(config)
"""
if "hidden_size" in common_kwargs:
raise ValueError("hidden size is called d_model")

View File

@@ -95,16 +95,16 @@ class BertConfig(PretrainedConfig):
Example::
from transformers import BertModel, BertConfig
>>> from transformers import BertModel, BertConfig
# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()
>>> # Initializing a BERT bert-base-uncased style configuration
>>> configuration = BertConfig()
# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)
>>> # Initializing a model from the bert-base-uncased style configuration
>>> model = BertModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "bert"

View File

@@ -66,16 +66,16 @@ class CTRLConfig(PretrainedConfig):
Example::
from transformers import CTRLModel, CTRLConfig
>>> from transformers import CTRLModel, CTRLConfig
# Initializing a CTRL configuration
configuration = CTRLConfig()
>>> # Initializing a CTRL configuration
>>> configuration = CTRLConfig()
# Initializing a model from the configuration
model = CTRLModel(configuration)
>>> # Initializing a model from the configuration
>>> model = CTRLModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "ctrl"

View File

@@ -80,16 +80,16 @@ class DistilBertConfig(PretrainedConfig):
Example::
from transformers import DistilBertModel, DistilBertConfig
>>> from transformers import DistilBertModel, DistilBertConfig
# Initializing a DistilBERT configuration
configuration = DistilBertConfig()
>>> # Initializing a DistilBERT configuration
>>> configuration = DistilBertConfig()
# Initializing a model from the configuration
model = DistilBertModel(configuration)
>>> # Initializing a model from the configuration
>>> model = DistilBertModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "distilbert"

View File

@@ -101,16 +101,16 @@ class ElectraConfig(PretrainedConfig):
Example::
from transformers import ElectraModel, ElectraConfig
>>> from transformers import ElectraModel, ElectraConfig
# Initializing a ELECTRA electra-base-uncased style configuration
configuration = ElectraConfig()
>>> # Initializing a ELECTRA electra-base-uncased style configuration
>>> configuration = ElectraConfig()
# Initializing a model from the electra-base-uncased style configuration
model = ElectraModel(configuration)
>>> # Initializing a model from the electra-base-uncased style configuration
>>> model = ElectraModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "electra"

View File

@@ -42,20 +42,20 @@ class EncoderDecoderConfig(PretrainedConfig):
Example::
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
# Initializing a BERT bert-base-uncased style configuration
config_encoder = BertConfig()
config_decoder = BertConfig()
>>> # Initializing a BERT bert-base-uncased style configuration
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
# Initializing a Bert2Bert model from the bert-base-uncased style configurations
model = EncoderDecoderModel(config=config)
>>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
>>> model = EncoderDecoderModel(config=config)
# Accessing the model configuration
config_encoder = model.config.encoder
config_decoder = model.config.decoder
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
"""
model_type = "encoder_decoder"

View File

@@ -100,16 +100,16 @@ class GPT2Config(PretrainedConfig):
Example::
from transformers import GPT2Model, GPT2Config
>>> from transformers import GPT2Model, GPT2Config
# Initializing a GPT2 configuration
configuration = GPT2Config()
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
# Initializing a model from the configuration
model = GPT2Model(configuration)
>>> # Initializing a model from the configuration
>>> model = GPT2Model(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "gpt2"

View File

@@ -49,16 +49,16 @@ class LongformerConfig(RobertaConfig):
Example::
from transformers import LongformerConfig, LongformerModel
>>> from transformers import LongformerConfig, LongformerModel
# Initializing a Longformer configuration
configuration = LongformerConfig()
>>> # Initializing a Longformer configuration
>>> configuration = LongformerConfig()
# Initializing a model from the configuration
model = LongformerModel(configuration)
>>> # Initializing a model from the configuration
>>> model = LongformerModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "longformer"

View File

@@ -85,16 +85,16 @@ class MobileBertConfig(PretrainedConfig):
Example:
from transformers import MobileBertModel, MobileBertConfig
>>> from transformers import MobileBertModel, MobileBertConfig
# Initializing a MobileBERT configuration
configuration = MobileBertConfig()
>>> # Initializing a MobileBERT configuration
>>> configuration = MobileBertConfig()
# Initializing a model from the configuration above
model = MobileBertModel(configuration)
>>> # Initializing a model from the configuration above
>>> model = MobileBertModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):

View File

@@ -98,16 +98,16 @@ class OpenAIGPTConfig(PretrainedConfig):
Example::
from transformers import OpenAIGPTConfig, OpenAIGPTModel
>>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
# Initializing a GPT configuration
configuration = OpenAIGPTConfig()
>>> # Initializing a GPT configuration
>>> configuration = OpenAIGPTConfig()
# Initializing a model from the configuration
model = OpenAIGPTModel(configuration)
>>> # Initializing a model from the configuration
>>> model = OpenAIGPTModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "openai-gpt"

View File

@@ -125,16 +125,16 @@ class ReformerConfig(PretrainedConfig):
Example::
from transformers import ReformerModel, ReformerConfig
>>> from transformers import ReformerModel, ReformerConfig
# Initializing a Reformer configuration
configuration = ReformerConfig()
>>> # Initializing a Reformer configuration
>>> configuration = ReformerConfig()
# Initializing a Reformer model
model = ReformerModel(configuration)
>>> # Initializing a Reformer model
>>> model = ReformerModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "reformer"

View File

@@ -49,16 +49,16 @@ class RobertaConfig(BertConfig):
Example::
from transformers import RobertaConfig, RobertaModel
>>> from transformers import RobertaConfig, RobertaModel
# Initializing a RoBERTa configuration
configuration = RobertaConfig()
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
# Initializing a model from the configuration
model = RobertaModel(configuration)
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "roberta"

View File

@@ -100,16 +100,16 @@ class TransfoXLConfig(PretrainedConfig):
Example::
from transformers import TransfoXLConfig, TransfoXLModel
>>> from transformers import TransfoXLConfig, TransfoXLModel
# Initializing a Transformer XL configuration
configuration = TransfoXLConfig()
>>> # Initializing a Transformer XL configuration
>>> configuration = TransfoXLConfig()
# Initializing a model from the configuration
model = TransfoXLModel(configuration)
>>> # Initializing a model from the configuration
>>> model = TransfoXLModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "transfo-xl"

View File

@@ -142,16 +142,16 @@ class XLMConfig(PretrainedConfig):
Example::
from transformers import XLMConfig, XLMModel
>>> from transformers import XLMConfig, XLMModel
# Initializing a XLM configuration
configuration = XLMConfig()
>>> # Initializing a XLM configuration
>>> configuration = XLMConfig()
# Initializing a model from the configuration
model = XLMModel(configuration)
>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlm"

View File

@@ -113,16 +113,16 @@ class XLNetConfig(PretrainedConfig):
Example::
from transformers import XLNetConfig, XLNetModel
>>> from transformers import XLNetConfig, XLNetModel
# Initializing a XLNet configuration
configuration = XLNetConfig()
>>> # Initializing a XLNet configuration
>>> configuration = XLNetConfig()
# Initializing a model from the configuration
model = XLNetModel(configuration)
>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlnet"

View File

@@ -488,11 +488,11 @@ class SquadProcessor(DataProcessor):
Examples::
import tensorflow_datasets as tfds
dataset = tfds.load("squad")
>>> import tensorflow_datasets as tfds
>>> dataset = tfds.load("squad")
training_examples = get_examples_from_dataset(dataset, evaluate=False)
evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
>>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
>>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
"""
if evaluate:

View File

@@ -186,6 +186,263 @@ def add_end_docstrings(*docstr):
return docstring_decorator
PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss, scores = outputs[:2]
"""
PT_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
>>> loss, start_scores, end_scores = outputs[:3]
"""
PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss, logits = outputs[:2]
"""
PT_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
>>> outputs = model(input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2]
"""
PT_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
PT_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True)
>>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> loss, logits = outputs[:2]
"""
PT_CAUSAL_LM_SAMPLE = r"""
Example::
>>> import torch
>>> from transformers import {tokenizer_class}, {model_class}
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> loss, logits = outputs[:2]
"""
TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> input_ids = inputs["input_ids"]
>>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
>>> outputs = model(inputs)
>>> loss, scores = outputs[:2]
"""
TF_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> input_dict = tokenizer(question, text, return_tensors='tf')
>>> start_scores, end_scores = model(input_dict)
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
>>> outputs = model(inputs)
>>> loss, logits = outputs[:2]
"""
TF_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores = outputs[0]
"""
TF_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
TF_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> logits = outputs[0]
"""
TF_CAUSAL_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> logits = outputs[0]
"""
def add_code_sample_docstrings(*docstr, tokenizer_class=None, checkpoint=None):
def docstring_decorator(fn):
model_class = fn.__qualname__.split(".")[0]
is_tf_class = model_class[:2] == "TF"
if "SequenceClassification" in model_class:
code_sample = TF_SEQUENCE_CLASSIFICATION_SAMPLE if is_tf_class else PT_SEQUENCE_CLASSIFICATION_SAMPLE
elif "QuestionAnswering" in model_class:
code_sample = TF_QUESTION_ANSWERING_SAMPLE if is_tf_class else PT_QUESTION_ANSWERING_SAMPLE
elif "TokenClassification" in model_class:
code_sample = TF_TOKEN_CLASSIFICATION_SAMPLE if is_tf_class else PT_TOKEN_CLASSIFICATION_SAMPLE
elif "MultipleChoice" in model_class:
code_sample = TF_MULTIPLE_CHOICE_SAMPLE if is_tf_class else PT_MULTIPLE_CHOICE_SAMPLE
elif "MaskedLM" in model_class:
code_sample = TF_MASKED_LM_SAMPLE if is_tf_class else PT_MASKED_LM_SAMPLE
elif "LMHead" in model_class:
code_sample = TF_CAUSAL_LM_SAMPLE if is_tf_class else PT_CAUSAL_LM_SAMPLE
elif "Model" in model_class:
code_sample = TF_BASE_MODEL_SAMPLE if is_tf_class else PT_BASE_MODEL_SAMPLE
else:
raise ValueError(f"Docstring can't be built for model {model_class}")
built_doc = code_sample.format(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint)
fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + built_doc
return fn
return docstring_decorator
def is_remote_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https")

View File

@@ -24,13 +24,15 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_albert import AlbertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "AlbertTokenizer"
ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert-base-v1",
@@ -485,6 +487,7 @@ class AlbertModel(AlbertPreTrainedModel):
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -521,18 +524,6 @@ class AlbertModel(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
from transformers import AlbertModel, AlbertTokenizer
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -657,16 +648,16 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
Examples::
from transformers import AlbertTokenizer, AlbertForPreTraining
import torch
>>> from transformers import AlbertTokenizer, AlbertForPreTraining
>>> import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForPreTraining.from_pretrained('albert-base-v2')
>>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
>>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
prediction_scores, sop_scores = outputs[:2]
>>> prediction_scores, sop_scores = outputs[:2]
"""
@@ -763,6 +754,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
return self.predictions.decoder
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -802,18 +794,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
from transformers import AlbertTokenizer, AlbertForMaskedLM
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@@ -863,6 +843,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -899,19 +880,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.albert(
@@ -962,6 +930,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -996,21 +965,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForTokenClassification
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForTokenClassification.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.albert(
@@ -1062,6 +1016,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -1104,21 +1059,6 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
start_scores, end_scores = model(**input_dict)
"""
outputs = self.albert(
@@ -1176,6 +1116,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -1213,25 +1154,6 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForMultipleChoice
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMultipleChoice.from_pretrained('albert-base-v2')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

View File

@@ -392,8 +392,8 @@ class AutoModel:
Examples::
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
>>> model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in MODEL_MAPPING.items():
if isinstance(config, config_class):
@@ -480,8 +480,7 @@ class AutoModel:
Examples::
model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
assert model.config.output_attention == True
assert model.config.output_attentions == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
@@ -547,8 +546,8 @@ class AutoModelForPreTraining:
Examples::
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
model = AutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
>>> model = AutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items():
if isinstance(config, config_class):

View File

@@ -27,12 +27,19 @@ from torch.nn import CrossEntropyLoss
from .activations import ACT2FN
from .configuration_bart import BartConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BartTokenizer"
BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/bart-large",
@@ -56,14 +63,17 @@ BART_START_DOCSTRING = r"""
"""
BART_GENERATION_EXAMPLE = r"""
Examples::
Summarization example::
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
# see ``examples/summarization/bart/run_eval.py`` for a longer example
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
@@ -807,6 +817,7 @@ class BartModel(PretrainedBartModel):
self.init_weights()
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -883,8 +894,7 @@ class BartModel(PretrainedBartModel):
@add_start_docstrings(
"The BART Model with a language modeling head. Can be used for summarization.",
BART_START_DOCSTRING + BART_GENERATION_EXAMPLE,
"The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
)
class BartForConditionalGeneration(PretrainedBartModel):
base_model_prefix = "model"
@@ -911,6 +921,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
self.register_buffer("final_logits_bias", new_bias)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_end_docstrings(BART_GENERATION_EXAMPLE)
def forward(
self,
input_ids,
@@ -951,18 +962,21 @@ class BartForConditionalGeneration(PretrainedBartModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Conditional generation example::
# Mask filling only works for bart-large
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained('bart-large')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
TXT = "My friends are <mask> but they eat too many carbs."
model = BartForConditionalGeneration.from_pretrained('bart-large')
input_ids = tokenizer.batch_encode_plus([TXT], return_tensors='pt')['input_ids']
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
logits = model(input_ids)[0]
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()
# ['good', 'great', 'all', 'really', 'very']
"""
@@ -1068,6 +1082,7 @@ class BartForSequenceClassification(PretrainedBartModel):
self.model._init_weights(self.classification_head.out_proj)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -1088,32 +1103,19 @@ class BartForSequenceClassification(PretrainedBartModel):
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BartConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification loss (cross entropy)
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BartTokenizer, BartForSequenceClassification
import torch
tokenizer = BartTokenizer.from_pretrained('bart-large')
model = BartForSequenceClassification.from_pretrained('bart-large')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute",
add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification loss (cross entropy)
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the
self-attention
heads.
"""
if labels is not None:
use_cache = False
@@ -1161,6 +1163,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
self.model._init_weights(self.qa_outputs)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -1200,25 +1203,6 @@ class BartForQuestionAnswering(PretrainedBartModel):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint bart-large is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import BartTokenizer, BartForQuestionAnswering
import torch
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForQuestionAnswering.from_pretrained('facebook/bart-large')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
start_scores, end_scores = model(torch.tensor([input_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
"""
if start_positions is not None and end_positions is not None:
use_cache = False
@@ -1259,7 +1243,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
return outputs # return outputs # (loss), start_logits, end_logits, encoder_outputs, (hidden_states), (attentions)
class SinusoidalPositionalEmbedding(nn.Embedding):

View File

@@ -28,12 +28,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .activations import gelu, gelu_new, swish
from .configuration_bert import BertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BertTokenizer"
BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bert-base-uncased",
"bert-large-uncased",
@@ -664,6 +666,7 @@ class BertModel(BertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -702,20 +705,6 @@ class BertModel(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertModel, BertTokenizer
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -851,16 +840,16 @@ class BertForPreTraining(BertPreTrainedModel):
Examples::
from transformers import BertTokenizer, BertForPreTraining
import torch
>>> from transformers import BertTokenizer, BertForPreTraining
>>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
@@ -958,19 +947,20 @@ class BertLMHeadModel(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
from transformers import BertTokenizer, BertLMHeadModel
import torch
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
>>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertLMHeadModel.from_pretrained('bert-base-uncased', is_decoder=True)
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
>>> config = BertConfig.from_pretrained("bert-base-cased")
>>> config.is_decoder = True
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.bert(
@@ -1028,6 +1018,7 @@ class BertForMaskedLM(BertPreTrainedModel):
return self.cls.predictions.decoder
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1069,20 +1060,6 @@ class BertForMaskedLM(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForMaskedLM
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@@ -1185,18 +1162,18 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
Examples::
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch
>>> from transformers import BertTokenizer, BertForNextSentencePrediction
>>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
assert logits[0, 0] < logits[0, 1] # next sentence was random
>>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
"""
outputs = self.bert(
@@ -1240,6 +1217,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1276,21 +1254,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.bert(
@@ -1340,6 +1303,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1377,25 +1341,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForMultipleChoice
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1453,6 +1398,7 @@ class BertForTokenClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1487,21 +1433,6 @@ class BertForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForTokenClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.bert(
@@ -1554,6 +1485,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1596,25 +1528,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text)
input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
assert answer == "a nice puppet"
"""
outputs = self.bert(

View File

@@ -31,6 +31,8 @@ from .modeling_roberta import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CamembertTokenizer"
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"camembert-base",
"Musixmatch/umberto-commoncrawl-cased-v1",

View File

@@ -24,12 +24,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss
from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CTRLTokenizer"
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl"
# See all CTRL models at https://huggingface.co/models?filter=ctrl
@@ -326,6 +328,7 @@ class CTRLModel(CTRLPreTrainedModel):
self.h[layer].multi_head_attention.prune_heads(heads)
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def forward(
self,
input_ids=None,
@@ -358,20 +361,6 @@ class CTRLModel(CTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import CTRLTokenizer, CTRLModel
import torch
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -510,6 +499,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def forward(
self,
input_ids=None,
@@ -552,19 +542,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,

View File

@@ -30,12 +30,13 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu
from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased",
@@ -409,6 +410,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
self.transformer.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
@@ -434,20 +436,6 @@ class DistilBertModel(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertModel
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertModel.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -506,6 +494,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
return self.vocab_projector
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
@@ -544,17 +533,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@@ -604,6 +582,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
@@ -639,18 +618,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
distilbert_output = self.distilbert(
input_ids=input_ids,
@@ -697,6 +664,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
@@ -737,20 +705,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:3]
"""
distilbert_output = self.distilbert(
input_ids=input_ids,
@@ -806,6 +760,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
@@ -838,19 +793,6 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.distilbert(
@@ -940,22 +882,23 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
Examples::
from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
import torch
>>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
>>> import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
>>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
>>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> loss, logits = outputs[:2]
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

View File

@@ -8,13 +8,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .activations import get_activation
from .configuration_electra import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel
from .modeling_utils import SequenceSummary
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator",
@@ -264,6 +265,7 @@ class ElectraModel(ElectraPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
@@ -291,20 +293,6 @@ class ElectraModel(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraModel, ElectraTokenizer
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -383,6 +371,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
@@ -419,21 +408,6 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = ElectraTokenizer.from_pretrained('bert-base-uncased')
model = ElectraForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
discriminator_hidden_states = self.electra(
input_ids,
@@ -521,16 +495,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
Examples::
from transformers import ElectraTokenizer, ElectraForPreTraining
import torch
>>> from transformers import ElectraTokenizer, ElectraForPreTraining
>>> import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
>>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
>>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> scores = model(input_ids)[0]
"""
@@ -589,6 +561,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
def forward(
self,
input_ids=None,
@@ -628,20 +601,6 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraTokenizer, ElectraForMaskedLM
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@@ -696,6 +655,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
@@ -730,21 +690,6 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraTokenizer, ElectraForTokenClassification
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
discriminator_hidden_states = self.electra(
@@ -802,6 +747,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
@@ -844,23 +790,6 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForQuestionAnswering.from_pretrained('google/electra-base-discriminator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
input_ids, token_type_ids = encoding['input_ids'], encoding['token_type_ids']
start_scores, end_scores = model(input_ids, token_type_ids=token_type_ids)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
"""
discriminator_hidden_states = self.electra(
@@ -918,6 +847,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
@@ -954,25 +884,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraTokenizer, ElectraForMultipleChoice
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForMultipleChoice.from_pretrained('google/electra-base-discriminator')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0) # choice0 is correct (according to Wikipedia ;))
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

View File

@@ -126,9 +126,8 @@ class EncoderDecoderModel(PreTrainedModel):
Examples::
from transformers import EncoderDecoder
model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
>>> from transformers import EncoderDecoderModel
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
"""
kwargs_encoder = {
@@ -244,21 +243,21 @@ class EncoderDecoderModel(PreTrainedModel):
Examples::
from transformers import EncoderDecoderModel, BertTokenizer
import torch
>>> from transformers import EncoderDecoderModel, BertTokenizer
>>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
# forward
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
>>> # forward
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
# training
loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2]
>>> # training
>>> loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)[:2]
# generation
generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
>>> # generation
>>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
"""

View File

@@ -22,7 +22,7 @@ import torch
from torch.nn import functional as F
from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_xlm import (
XLMForQuestionAnswering,
XLMForQuestionAnsweringSimple,
@@ -35,6 +35,8 @@ from .modeling_xlm import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "FlaubertTokenizer"
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"flaubert/flaubert_small_cased",
"flaubert/flaubert_base_uncased",
@@ -119,6 +121,7 @@ class FlaubertModel(XLMModel):
self.pre_norm = getattr(config, "pre_norm", False)
@add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="flaubert/flaubert_base_cased")
def forward(
self,
input_ids=None,
@@ -149,18 +152,6 @@ class FlaubertModel(XLMModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import FlaubertTokenizer, FlaubertModel
import torch
tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
model = FlaubertModel.from_pretrained('flaubert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (

View File

@@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss
from .activations import ACT2FN
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
Conv1D,
PreTrainedModel,
@@ -38,6 +38,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"gpt2",
"gpt2-medium",
@@ -370,6 +372,7 @@ class GPT2Model(GPT2PreTrainedModel):
self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def forward(
self,
input_ids=None,
@@ -403,18 +406,6 @@ class GPT2Model(GPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import GPT2Tokenizer, GPT2Model
import torch
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -553,6 +544,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def forward(
self,
input_ids=None,
@@ -595,19 +587,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
@@ -721,26 +700,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
Examples::
import torch
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
>>> import torch
>>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
# Add a [CLS] to the vocabulary (we should train it also!)
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
>>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
if "lm_labels" in kwargs:

View File

@@ -24,13 +24,15 @@ from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from .configuration_longformer import LongformerConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertPreTrainedModel
from .modeling_roberta import RobertaLMHead, RobertaModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "LongformerTokenizer"
LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"allenai/longformer-base-4096",
"allenai/longformer-large-4096",
@@ -609,22 +611,22 @@ class LongformerModel(RobertaModel):
Examples::
import torch
from transformers import LongformerModel, LongformerTokenizer
>>> import torch
>>> from transformers import LongformerModel, LongformerTokenizer
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
# classification: the <s> token
# QA: question tokens
# LM: potentially on the beginning of sentences and paragraphs
sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
>>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
>>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
>>> attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
... # classification: the <s> token
... # QA: question tokens
... # LM: potentially on the beginning of sentences and paragraphs
>>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -743,18 +745,18 @@ class LongformerForMaskedLM(BertPreTrainedModel):
Examples::
import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer
>>> import torch
>>> from transformers import LongformerForMaskedLM, LongformerTokenizer
model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
# check ``LongformerModel.forward`` for more details how to set `attention_mask`
loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
>>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
... # check ``LongformerModel.forward`` for more details how to set `attention_mask`
>>> loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
"""
if "masked_lm_labels" in kwargs:
@@ -807,6 +809,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward(
self,
input_ids=None,
@@ -843,19 +846,6 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import LongformerTokenizer, LongformerForSequenceClassification
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if global_attention_mask is None:
@@ -973,25 +963,25 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
Examples::
from transformers import LongformerTokenizer, LongformerForQuestionAnswering
import torch
>>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
>>> import torch
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
>>> input_ids = encoding["input_ids"]
# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]
>>> # default is local attention everywhere
>>> # the forward method will automatically set global attention on question tokens
>>> attention_mask = encoding["attention_mask"]
start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
>>> start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
>>> answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
>>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
"""
@@ -1060,6 +1050,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward(
self,
input_ids=None,
@@ -1094,19 +1085,6 @@ class LongformerForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import LongformerTokenizer, LongformerForTokenClassification
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForTokenClassification.from_pretrained('allenai/longformer-base-4096')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.longformer(
@@ -1163,6 +1141,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward(
self,
input_ids=None,
@@ -1200,23 +1179,6 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import LongformerTokenizer, LongformerForMultipleChoice
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096')
# context = "The dog is cute" | choice = "the dog" / "the cat"
choices = [("The dog is cute", "the dog"), ("The dog is cute", "the cat")]
input_ids = torch.tensor([tokenizer.encode(s[0], s[1], add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
# global attention is automatically put on "the dog" and "the cat"
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

View File

@@ -31,18 +31,18 @@ class MarianMTModel(BartForConditionalGeneration):
Examples::
from transformers import MarianTokenizer, MarianMTModel
from typing import List
src = 'fr' # source language
trg = 'en' # target language
sample_text = "où est l'arrêt de bus ?"
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
>>> from transformers import MarianTokenizer, MarianMTModel
>>> from typing import List
>>> src = 'fr' # source language
>>> trg = 'en' # target language
>>> sample_text = "où est l'arrêt de bus ?"
>>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
model = MarianMTModel.from_pretrained(mname)
tok = MarianTokenizer.from_pretrained(mname)
batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference
gen = model.generate(**batch) # for forward pass: model(**batch)
words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?"
>>> model = MarianMTModel.from_pretrained(mname)
>>> tok = MarianTokenizer.from_pretrained(mname)
>>> batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference
>>> gen = model.generate(**batch) # for forward pass: model(**batch)
>>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?"
"""

View File

@@ -34,11 +34,14 @@ from transformers.modeling_bert import BertIntermediate
from .activations import gelu, gelu_new, swish
from .configuration_mobilebert import MobileBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
@@ -747,6 +750,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -785,20 +789,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertModel, MobileBertTokenizer
import torch
tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertModel.from_pretrained(model_name_or_path)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -951,13 +941,17 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForPreTraining
import torch
tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertForPreTraining.from_pretrained(model_name_or_path)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
>>> import torch
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
outputs = self.mobilebert(
@@ -1022,6 +1016,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1063,20 +1058,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForMaskedLM
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@@ -1174,18 +1155,17 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
Examples::
from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
import torch
>>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
>>> import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
assert logits[0, 0] < logits[0, 1] # next sentence was random
>>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
"""
outputs = self.mobilebert(
@@ -1228,6 +1208,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1263,20 +1244,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.mobilebert(
@@ -1321,6 +1288,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1363,25 +1331,6 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForQuestionAnswering
import torch
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertForQuestionAnswering.from_pretrained(model_name_or_path)
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text)
input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
assert answer == "a nice puppet"
"""
outputs = self.mobilebert(
@@ -1439,6 +1388,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1476,25 +1426,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForMultipleChoice
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1552,6 +1483,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1586,21 +1518,6 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForTokenClassification
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.mobilebert(

View File

@@ -28,7 +28,7 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu_new, swish
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
Conv1D,
PreTrainedModel,
@@ -40,6 +40,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-gpt",
# See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
@@ -356,6 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def forward(
self,
input_ids=None,
@@ -383,18 +386,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -490,6 +481,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
return self.lm_head
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def forward(
self,
input_ids=None,
@@ -531,18 +523,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,

View File

@@ -29,12 +29,20 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu, gelu_fast, gelu_new, swish
from .configuration_reformer import ReformerConfig
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ReformerTokenizer"
REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/reformer-crime-and-punishment",
"google/reformer-enwik8",
@@ -1543,6 +1551,7 @@ class ReformerModel(ReformerPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
def forward(
self,
input_ids=None,
@@ -1570,19 +1579,6 @@ class ReformerModel(ReformerPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ReformerModel, ReformerTokenizer
import torch
tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1738,6 +1734,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
pass
@add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
def forward(
self,
input_ids=None,
@@ -1774,19 +1771,6 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ReformerModelWithLMHead, ReformerTokenizer
import torch
tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
reformer_outputs = self.reformer(

View File

@@ -24,12 +24,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "RobertaTokenizer"
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"roberta-base",
"roberta-large",
@@ -177,6 +179,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -216,18 +219,6 @@ class RobertaForMaskedLM(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@@ -304,6 +295,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -340,19 +332,6 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.roberta(
input_ids,
@@ -400,6 +379,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -437,20 +417,6 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForMultipleChoice
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -510,6 +476,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -544,19 +511,6 @@ class RobertaForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForTokenClassification
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.roberta(
@@ -632,6 +586,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -674,25 +629,6 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint roberta-large is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
start_scores, end_scores = model(torch.tensor([input_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
"""
outputs = self.roberta(

View File

@@ -33,6 +33,8 @@ from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, p
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "T5Tokenizer"
####################################################
# This dict contrains shortcut names and associated url
# for the pretrained weights provided with the models
@@ -924,16 +926,17 @@ class T5Model(T5PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
from transformers import T5Tokenizer, T5Model
>>> from transformers import T5Tokenizer, T5Model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5Model.from_pretrained('t5-small')
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5Model.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -1068,18 +1071,18 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2]
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model.generate(input_ids)
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model.generate(input_ids)
"""
if "lm_labels" in kwargs:

View File

@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from .configuration_albert import AlbertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
@@ -39,6 +44,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "AlbertTokenizer"
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert-base-v1",
"albert-large-v1",
@@ -713,6 +720,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
self.albert = TFAlbertMainLayer(config, name="albert")
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -737,18 +745,6 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertModel.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.albert(inputs, **kwargs)
return outputs
@@ -837,6 +833,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
return self.albert.embeddings
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -854,18 +851,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForMaskedLM
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.albert(inputs, **kwargs)
@@ -895,6 +880,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
@@ -930,19 +916,6 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -994,6 +967,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
@@ -1027,19 +1001,6 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForTokenClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForTokenClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -1089,6 +1050,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
@@ -1130,24 +1092,6 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForQuestionAnswering
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
@@ -1213,6 +1157,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs,
@@ -1249,22 +1194,6 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForMultipleChoice
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tokenizer(choices, add_special_tokens=True, return_tensors='tf', truncation=True, padding=True)[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]

View File

@@ -22,7 +22,12 @@ import numpy as np
import tensorflow as tf
from .configuration_bert import BertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BertTokenizer"
TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bert-base-uncased",
@@ -704,6 +710,7 @@ class TFBertModel(TFBertPreTrainedModel):
self.bert = TFBertMainLayer(config, name="bert")
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -728,18 +735,6 @@ class TFBertModel(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.bert(inputs, **kwargs)
return outputs
@@ -819,6 +814,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
return self.bert.embeddings
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -836,18 +832,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.bert(inputs, **kwargs)
@@ -930,6 +914,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
@@ -965,19 +950,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -1037,6 +1009,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs,
@@ -1073,22 +1046,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMultipleChoice
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -1177,6 +1134,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
@@ -1210,19 +1168,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForTokenClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -1273,6 +1218,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
@@ -1314,22 +1260,6 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForQuestionAnswering
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions

View File

@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFPreTrainedModel,
TFSharedEmbeddings,
@@ -35,6 +35,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CtrlTokenizer"
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl"
# See all CTRL models at https://huggingface.co/models?filter=ctrl
@@ -489,6 +491,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
self.transformer = TFCTRLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -510,18 +513,6 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import CTRLTokenizer, TFCTRLModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = TFCTRLModel.from_pretrained('ctrl')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
@@ -569,6 +560,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -590,19 +582,6 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import CTRLTokenizer, TFCTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = TFCTRLLMHeadModel.from_pretrained('ctrl')
input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
outputs = model(input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]

View File

@@ -23,7 +23,12 @@ import numpy as np
import tensorflow as tf
from .configuration_distilbert import DistilBertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
@@ -41,6 +46,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased",
@@ -575,6 +581,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -592,17 +599,6 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.distilbert(inputs, **kwargs)
return outputs
@@ -647,6 +643,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
return self.vocab_projector.input_embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs):
r"""
@@ -665,18 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
distilbert_output = self.distilbert(inputs, **kwargs)
@@ -713,6 +698,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
@@ -746,19 +732,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels
@@ -809,6 +782,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
@@ -840,19 +814,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels
@@ -916,6 +877,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs,
@@ -950,22 +912,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -1046,6 +992,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
@@ -1085,21 +1032,6 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[6] if len(inputs) > 6 else start_positions

View File

@@ -4,7 +4,7 @@ import tensorflow as tf
from transformers import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import (
TFQuestionAnsweringLoss,
@@ -18,6 +18,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator",
@@ -383,6 +384,7 @@ class TFElectraModel(TFElectraPreTrainedModel):
self.electra = TFElectraMainLayer(config, name="electra")
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -400,17 +402,6 @@ class TFElectraModel(TFElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraModel
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.electra(inputs, **kwargs)
return outputs
@@ -532,6 +523,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
def call(
self,
input_ids=None,
@@ -560,18 +552,6 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForMaskedLM
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
generator_hidden_states = self.electra(
@@ -611,6 +591,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(
self,
inputs=None,
@@ -644,19 +625,6 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForTokenClassification
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -705,6 +673,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(
self,
inputs=None,
@@ -746,22 +715,6 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-generator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions

View File

@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"gpt2",
"gpt2-medium",
@@ -490,6 +492,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
self.transformer = TFGPT2MainLayer(config, name="transformer")
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -511,18 +514,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
@@ -549,6 +540,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -570,19 +562,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
@@ -659,29 +638,26 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
Examples::
# For example purposes. Not runnable.
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
>>> import tensorflow as tf
>>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
# Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
>>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):

View File

@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from . import MobileBertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"mobilebert-uncased",
@@ -621,19 +627,6 @@ class TFMobileBertMLMHead(tf.keras.layers.Layer):
return prediction_scores
class TFMobileBertPreTrainingHeads(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.predictions = TFMobileBertLMPredictionHead(config, name="predictions")
self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship")
def call(self, inputs):
sequence_output, pooled_output = inputs
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
@keras_serializable
class TFMobileBertMainLayer(tf.keras.layers.Layer):
config_class = MobileBertConfig
@@ -845,6 +838,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -869,18 +863,6 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertModel
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertModel.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.mobilebert(inputs, **kwargs)
return outputs
@@ -895,7 +877,8 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.cls = TFMobileBertPreTrainingHeads(config, name="cls")
self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")
self.seq_relationship = TFMobileBertOnlyNSPHead(2, name="seq_relationship___cls")
def get_output_embeddings(self):
return self.mobilebert.embeddings
@@ -923,20 +906,21 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
>>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForPreTraining.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
outputs = self.mobilebert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls([sequence_output, pooled_output])
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[
2:
] # add hidden states and attention if they are here
@@ -956,6 +940,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
return self.mobilebert.embeddings
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -973,18 +958,6 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForMaskedLM
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.mobilebert(inputs, **kwargs)
@@ -1015,7 +988,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.cls = TFMobileBertOnlyNSPHead(config, name="cls")
self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
def call(self, inputs, **kwargs):
@@ -1038,18 +1011,17 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
>>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random
>>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
"""
outputs = self.mobilebert(inputs, **kwargs)
@@ -1078,6 +1050,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
@@ -1113,19 +1086,6 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFBMobileBertForSequenceClassification
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForSequenceClassification.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -1176,6 +1136,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
@@ -1217,22 +1178,6 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForQuestionAnswering.from_pretrained('mobilebert-uncased') # Not a fine-tuned model! Load a fine-tuned model to obtain coherent results.
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
@@ -1298,6 +1243,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs,
@@ -1334,22 +1280,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForMultipleChoice
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -1438,6 +1368,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
@@ -1471,19 +1402,6 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForTokenClassification
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels

View File

@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-gpt",
# See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
@@ -449,6 +451,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -466,18 +469,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
@@ -497,6 +488,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
return self.transformer.tokens_embed
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -514,18 +506,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
@@ -601,26 +581,23 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
Examples::
# For example purposes. Not runnable.
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
>>> import tensorflow as tf
>>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
>>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
# Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices
mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :] # Batch size 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoding = tokenizer(choices, return_tensors="tf")
>>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
>>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :] # Batch size 1
>>> outputs = model(inputs)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
@@ -633,7 +610,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)

View File

@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from .configuration_roberta import RobertaConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
@@ -38,6 +43,8 @@ from .tokenization_utils_base import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "RobertaTokenizer"
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"roberta-base",
"roberta-large",
@@ -195,6 +202,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
self.roberta = TFRobertaMainLayer(config, name="roberta")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -219,18 +227,6 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.roberta(inputs, **kwargs)
return outputs
@@ -279,6 +275,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -296,18 +293,6 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMaskedLM
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.roberta(inputs, **kwargs)
@@ -358,6 +343,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
self.classifier = TFRobertaClassificationHead(config, name="classifier")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
@@ -387,19 +373,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -441,7 +414,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFBertMainLayer(config, name="roberta")
self.roberta = TFRobertaMainLayer(config, name="roberta")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
@@ -457,6 +430,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs,
@@ -493,22 +467,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMultipleChoice
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -592,6 +550,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
)
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
@@ -625,19 +584,6 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForTokenClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -687,6 +633,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
)
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
@@ -728,24 +675,6 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint roberta-base is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions

View File

@@ -37,6 +37,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "T5Tokenizer"
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"t5-small",
"t5-base",
@@ -931,13 +933,13 @@ class TFT5Model(TFT5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, TFT5Model
>>> from transformers import T5Tokenizer, TFT5Model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small')
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5Model.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
@@ -1074,18 +1076,18 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
>>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs)
prediction_scores = outputs[0]
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs)
>>> prediction_scores = outputs[0]
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
model.generate(inputs)
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> result = model.generate(inputs)
"""

View File

@@ -22,7 +22,7 @@ import logging
import tensorflow as tf
from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
from .modeling_tf_utils import (
TFPreTrainedModel,
@@ -36,6 +36,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl-wt103",
# See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
@@ -722,6 +724,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -743,18 +746,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
@@ -811,6 +802,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def call(
self,
inputs,
@@ -842,18 +834,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -863,7 +843,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
labels = inputs[4] if len(inputs) > 4 else labels
output_attentions = inputs[5] if len(inputs) > 5 else output_attentions
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
elif isinstance(inputs, (BatchEncoding, dict)):
input_ids = inputs.get("input_ids")
mems = inputs.get("mems", mems)
head_mask = inputs.get("head_mask", head_mask)

View File

@@ -24,7 +24,12 @@ import numpy as np
import tensorflow as tf
from .configuration_xlm import XLMConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
@@ -43,6 +48,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLMTokenizer"
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlm-mlm-en-2048",
"xlm-mlm-ende-1024",
@@ -608,6 +615,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
self.transformer = TFXLMMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -625,18 +633,6 @@ class TFXLMModel(TFXLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
@@ -704,6 +700,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
return {"inputs": inputs, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -721,18 +718,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMWithLMHeadModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
transformer_outputs = self.transformer(inputs, **kwargs)
@@ -757,6 +742,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs=None,
@@ -795,19 +781,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForSequenceClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels
@@ -865,6 +838,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs,
@@ -876,9 +850,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r"""
@@ -904,22 +878,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForMultipleChoice
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForMultipleChoice.from_pretrained('xlm-mlm-en-2048')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -932,7 +890,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
head_mask = inputs[7] if len(inputs) > 7 else head_mask
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
assert len(inputs) <= 10, "Too many inputs."
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
labels = inputs[11] if len(inputs) > 11 else labels
assert len(inputs) <= 11, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
@@ -944,7 +904,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 10, "Too many inputs."
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
labels = inputs.get("labels", labels)
assert len(inputs) <= 12, "Too many inputs."
else:
input_ids = inputs
@@ -1001,13 +963,14 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
self.transformer = TFXLMMainLayer(config, name="transformer")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
input_ids=None,
inputs=None,
attention_mask=None,
langs=None,
token_type_ids=None,
@@ -1016,9 +979,9 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r"""
@@ -1041,25 +1004,22 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForTokenClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForTokenClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels
if len(inputs) > 11:
inputs = inputs[:11]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer(
input_ids,
inputs,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
@@ -1072,7 +1032,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
outputs = (logits,) + transformer_outputs[2:] # add hidden states and attention if they are here
outputs = (logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, logits)
@@ -1095,6 +1055,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs=None,
@@ -1139,21 +1100,6 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[11] if len(inputs) > 11 else start_positions

View File

@@ -23,7 +23,12 @@ import numpy as np
import tensorflow as tf
from .configuration_xlnet import XLNetConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
@@ -42,6 +47,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLNetTokenizer"
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet-base-cased",
"xlnet-large-cased",
@@ -832,6 +839,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
self.transformer = TFXLNetMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(self, inputs, **kwargs):
r"""
Return:
@@ -853,18 +861,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetModel.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
@@ -949,10 +945,13 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
@@ -986,6 +985,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
@@ -1029,19 +1029,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels
@@ -1105,6 +1092,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
@@ -1145,22 +1133,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForMultipleChoice
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -1257,6 +1229,8 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
@@ -1298,19 +1272,6 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForTokenClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels
@@ -1361,6 +1322,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
@@ -1412,21 +1374,6 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[12] if len(inputs) > 12 else start_positions

View File

@@ -27,13 +27,15 @@ import torch.nn as nn
import torch.nn.functional as F
from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl-wt103",
# See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
@@ -749,6 +751,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return new_mems
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def forward(
self,
input_ids=None,
@@ -778,18 +781,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import TransfoXLTokenizer, TransfoXLModel
import torch
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -945,6 +936,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def forward(
self,
input_ids=None,
@@ -984,18 +976,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
import torch
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
"""
if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1)

View File

@@ -978,13 +978,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
Examples::
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
outputs = model.generate(max_length=40) # do greedy decoding
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
input_context = 'The dog'
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
@@ -992,22 +994,22 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
input_context = 'The dog'
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling
outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # 3 generate sequences using by sampling
for i in range(3): # 3 output sequences were generated
print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl
bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context

View File

@@ -28,7 +28,7 @@ from torch.nn import functional as F
from .activations import gelu
from .configuration_xlm import XLMConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
PreTrainedModel,
SequenceSummary,
@@ -40,6 +40,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLMTokenizer"
XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlm-mlm-en-2048",
"xlm-mlm-ende-1024",
@@ -395,6 +397,7 @@ class XLMModel(XLMPreTrainedModel):
self.attentions[layer].prune_heads(heads)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
@@ -425,18 +428,6 @@ class XLMModel(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMModel
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -632,6 +623,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
return {"input_ids": input_ids, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
@@ -672,18 +664,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMWithLMHeadModel
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
transformer_outputs = self.transformer(
input_ids,
@@ -722,6 +702,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
@@ -761,19 +742,6 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForSequenceClassification
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
@@ -822,6 +790,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
@@ -867,20 +836,6 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,
@@ -1006,19 +961,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
from transformers import XLMTokenizer, XLMForQuestionAnswering
import torch
>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,
@@ -1067,6 +1023,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
@@ -1074,6 +1031,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
labels=None,
output_attentions=None,
@@ -1101,19 +1060,6 @@ class XLMForTokenClassification(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForTokenClassification
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.transformer(
input_ids,
@@ -1121,6 +1067,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,

View File

@@ -26,12 +26,14 @@ from torch.nn import functional as F
from .activations import gelu_new, swish
from .configuration_xlnet import XLNetConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLNetTokenizer"
XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet-base-cased",
"xlnet-large-cased",
@@ -749,6 +751,7 @@ class XLNetModel(XLNetPreTrainedModel):
return pos_emb
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
@@ -785,20 +788,6 @@ class XLNetModel(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetModel
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetModel.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1164,6 +1153,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
@@ -1208,20 +1198,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForSequenceClassification
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
@@ -1273,6 +1249,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
@@ -1316,21 +1293,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForTokenClassification
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
scores = outputs[0]
"""
outputs = self.transformer(
@@ -1386,6 +1348,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
@@ -1431,22 +1394,6 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForMultipleChoice
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1508,6 +1455,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
@@ -1558,22 +1506,6 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
"""
outputs = self.transformer(
@@ -1705,20 +1637,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
from transformers import XLNetTokenizer, XLNetForQuestionAnswering
import torch
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,

View File

@@ -66,13 +66,15 @@ class MBartTokenizer(XLMRobertaTokenizer):
The tokenization method is <tokens> <eos> <language code>. There is no BOS token.
Examples::
from transformers import MBartTokenizer
tokenizer = MBartTokenizer.from_pretrained('mbart-large-en-ro')
example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
batch: dict = tokenizer.prepare_translation_batch(
example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
)
>>> from transformers import MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro')
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> batch: dict = tokenizer.prepare_translation_batch(
... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
... )
"""
vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}

View File

@@ -25,13 +25,13 @@ class MarianTokenizer(PreTrainedTokenizer):
Examples::
from transformers import MarianTokenizer
tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
# keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
# model(**batch) should work
>>> from transformers import MarianTokenizer
>>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
>>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
>>> batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
>>> # keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
>>> # model(**batch) should work
"""
vocab_files_names = vocab_files_names

View File

@@ -81,6 +81,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,

View File

@@ -94,6 +94,7 @@ class T5Tokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,

View File

@@ -13,52 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import doctest
import logging
import os
import unittest
from pathlib import Path
from typing import List, Union
import transformers
from .utils import require_tf, require_torch, slow
def get_examples_from_file(file):
examples = []
example = []
example_mode = False
example_indentation = None
for i, line in enumerate(file):
if example_mode:
current_indentation = len(line) - len(line.strip()) - 1
# Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
empty_line = example_indentation == 0 and len(line) == 1
# If we're back to the example indentation or if it's the end of the docstring.
if (current_indentation == example_indentation and not empty_line) or '"""' in line:
# Exit the example mode and add the example to the examples list
example_mode = False
example_indentation = None
examples.append(example)
example = []
else:
# If line is not empty, add it to the current example
if line != "\n":
example.append(line[example_indentation + 4 : -1])
# Detect the example from '::' or 'example::'
if "example::" in line.lower():
example_mode = True
example_indentation = line.lower().find("example::")
elif "examples::" in line.lower():
example_mode = True
example_indentation = line.lower().find("examples::")
# elif "::" in line.lower() and len(line.strip()) == 2:
# example_mode = True
# example_indentation = line.lower().find("::")
examples = ["\n".join(example) for example in examples]
examples = [example for example in examples if "not runnable" not in example.lower()]
return examples
logger = logging.getLogger()
@require_torch
@@ -66,68 +33,81 @@ def get_examples_from_file(file):
@slow
class TestCodeExamples(unittest.TestCase):
def analyze_directory(
self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
self,
directory: Path,
identifier: Union[str, None] = None,
ignore_files: Union[List[str], None] = [],
n_identifier: Union[str, None] = None,
only_modules: bool = True,
):
"""
Runs through the specific directory, looking for the files identified with `identifier`. Executes
the doctests in those files
Args:
directory (:obj:`str`): Directory containing the files
identifier (:obj:`str`): Will parse files containing this
ignore_files (:obj:`List[str]`): List of files to skip
n_identifier (:obj:`str` or :obj:`List[str]`): Will not parse files containing this/these identifiers.
only_modules (:obj:`bool`): Whether to only analyze modules
"""
files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
if identifier is not None:
files = [file for file in files if identifier in file]
if ignore_files is not None:
files = [file for file in files if file not in ignore_files]
if n_identifier is not None:
if isinstance(n_identifier, List):
for n_ in n_identifier:
files = [file for file in files if n_ not in file]
else:
files = [file for file in files if n_identifier not in file]
ignore_files.append("__init__.py")
files = [file for file in files if file not in ignore_files]
for file in files:
# Open all files
print("Testing", file, end=" ")
with open(os.path.join(directory, file)) as f:
# Retrieve examples
examples = get_examples_from_file(f)
joined_examples = []
print("Testing", file)
def execute_example(code_example):
exec(code_example, {})
# Some examples are the continuation of others.
if len(examples) > 0:
joined_examples.append(examples[0])
joined_examples_index = 0
for example in examples[1:]:
# If they contain this line, then they're a continuation of the previous script
if "# Continuation of the previous script" in example:
joined_examples[joined_examples_index] += "\n" + example
# If not, create a new example and increment the index
else:
joined_examples.append(example)
joined_examples_index += 1
print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
# Execute sub tests with every example.
for index, code_example in enumerate(joined_examples):
with self.subTest(msg=file + " " + str(index) + "/" + str(len(joined_examples)) + code_example):
execute_example(code_example)
def test_configuration_examples(self):
transformers_directory = "src/transformers"
configuration_files = "configuration"
ignore_files = ["configuration_auto.py", "configuration_utils.py"]
self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
def test_main_doc_examples(self):
doc_directory = "docs/source"
ignore_files = ["favicon.ico"]
self.analyze_directory(doc_directory, ignore_files=ignore_files)
if only_modules:
try:
module_identifier = file.split(".")[0]
module_identifier = getattr(transformers, module_identifier)
suite = doctest.DocTestSuite(module_identifier)
result = unittest.TextTestRunner().run(suite)
self.assertIs(len(result.failures), 0)
except AttributeError:
logger.info(f"{module_identifier} is not a module.")
else:
result = doctest.testfile(str(".." / directory / file), optionflags=doctest.ELLIPSIS)
self.assertIs(result.failed, 0)
def test_modeling_examples(self):
transformers_directory = "src/transformers"
modeling_files = "modeling"
files = "modeling"
ignore_files = [
"modeling_auto.py",
"modeling_t5.py",
"modeling_tf_auto.py",
"modeling_utils.py",
"modeling_tf_t5.py",
"modeling_bart.py",
"modeling_tf_utils.py",
"modeling_ctrl.py",
"modeling_tf_ctrl.py",
]
self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files)
def test_tokenization_examples(self):
transformers_directory = Path("src/transformers")
files = "tokenization"
self.analyze_directory(transformers_directory, identifier=files)
def test_configuration_examples(self):
transformers_directory = Path("src/transformers")
files = "configuration"
self.analyze_directory(transformers_directory, identifier=files)
def test_remaining_examples(self):
transformers_directory = Path("src/transformers")
n_identifiers = ["configuration", "modeling", "tokenization"]
self.analyze_directory(transformers_directory, n_identifier=n_identifiers)
def test_doc_sources(self):
doc_source_directory = Path("docs/source")
ignore_files = ["favicon.ico"]
self.analyze_directory(doc_source_directory, ignore_files=ignore_files, only_modules=False)

View File

@@ -31,6 +31,7 @@ if is_tf_available():
TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TFXLMForTokenClassification,
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
)
@@ -219,6 +220,26 @@ class TFXLMModelTester:
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
def create_and_check_xlm_for_token_classification(
self,
config,
input_ids,
token_type_ids,
input_lengths,
sequence_labels,
token_labels,
is_impossible_labels,
input_mask,
):
config.num_labels = self.num_labels
model = TFXLMForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
@@ -244,7 +265,14 @@ class TFXLMModelTester:
class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (
(TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
# TODO The multiple choice model is missing and should be added.
(
TFXLMModel,
TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TFXLMForTokenClassification,
)
if is_tf_available()
else ()
)
@@ -275,6 +303,10 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: