Merge branch 'master' into t5
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
function addIcon() {
|
||||
const huggingFaceLogo = "https://huggingface.co/assets/transformers-docs/huggingface_logo.svg";
|
||||
const huggingFaceLogo = "https://huggingface.co/landing/assets/transformers-docs/huggingface_logo.svg";
|
||||
const image = document.createElement("img");
|
||||
image.setAttribute("src", huggingFaceLogo);
|
||||
|
||||
@@ -24,10 +24,10 @@ function addCustomFooter() {
|
||||
social.classList.add("footer__Social");
|
||||
|
||||
const imageDetails = [
|
||||
{ link: "https://huggingface.co", imageLink: "https://huggingface.co/assets/transformers-docs/website.svg" },
|
||||
{ link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/assets/transformers-docs/twitter.svg" },
|
||||
{ link: "https://github.com/huggingface", imageLink: "https://huggingface.co/assets/transformers-docs/github.svg" },
|
||||
{ link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/assets/transformers-docs/linkedin.svg" }
|
||||
{ link: "https://huggingface.co", imageLink: "https://huggingface.co/landing/assets/transformers-docs/website.svg" },
|
||||
{ link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/twitter.svg" },
|
||||
{ link: "https://github.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/github.svg" },
|
||||
{ link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/landing/assets/transformers-docs/linkedin.svg" }
|
||||
];
|
||||
|
||||
imageDetails.forEach(imageLinks => {
|
||||
|
||||
@@ -26,7 +26,7 @@ author = u'huggingface'
|
||||
# The short X.Y version
|
||||
version = u''
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = u'2.1.1'
|
||||
release = u'2.2.1'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
@@ -47,6 +47,9 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
||||
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
||||
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
|
||||
9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
|
||||
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
@@ -89,3 +92,5 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
||||
model_doc/roberta
|
||||
model_doc/distilbert
|
||||
model_doc/ctrl
|
||||
model_doc/camembert
|
||||
model_doc/albert
|
||||
|
||||
@@ -24,15 +24,24 @@ pip install [--editable] .
|
||||
|
||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||
|
||||
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
||||
Tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||
|
||||
Run all the tests from the root of the cloned repository with the commands:
|
||||
|
||||
```bash
|
||||
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
``` bash
|
||||
python -m pytest -sv ./transformers/tests/
|
||||
python -m pytest -sv ./examples/
|
||||
```
|
||||
|
||||
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||
|
||||
## OpenAI GPT original tokenization workflow
|
||||
|
||||
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
|
||||
|
||||
@@ -5,6 +5,7 @@ The ``.optimization`` module provides:
|
||||
|
||||
- an optimizer with weight decay fixed that can be used to fine-tuned models, and
|
||||
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
|
||||
- a gradient accumulation class to accumulate the gradients of multiple batches
|
||||
|
||||
``AdamW``
|
||||
~~~~~~~~~~~~~~~~
|
||||
@@ -12,25 +13,32 @@ The ``.optimization`` module provides:
|
||||
.. autoclass:: transformers.AdamW
|
||||
:members:
|
||||
|
||||
``AdamWeightDecay``
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AdamWeightDecay
|
||||
:members:
|
||||
|
||||
.. autofunction:: transformers.create_optimizer
|
||||
:members:
|
||||
|
||||
Schedules
|
||||
----------------------------------------------------
|
||||
|
||||
Learning Rate Schedules
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autoclass:: transformers.ConstantLRSchedule
|
||||
:members:
|
||||
.. autofunction:: transformers.get_constant_schedule
|
||||
|
||||
|
||||
.. autoclass:: transformers.WarmupConstantSchedule
|
||||
:members:
|
||||
.. autofunction:: transformers.get_constant_schedule_with_warmup
|
||||
|
||||
.. image:: /imgs/warmup_constant_schedule.png
|
||||
:target: /imgs/warmup_constant_schedule.png
|
||||
:alt:
|
||||
|
||||
|
||||
.. autoclass:: transformers.WarmupCosineSchedule
|
||||
.. autofunction:: transformers.get_cosine_schedule_with_warmup
|
||||
:members:
|
||||
|
||||
.. image:: /imgs/warmup_cosine_schedule.png
|
||||
@@ -38,8 +46,7 @@ Learning Rate Schedules
|
||||
:alt:
|
||||
|
||||
|
||||
.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
|
||||
:members:
|
||||
.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup
|
||||
|
||||
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
||||
:target: /imgs/warmup_cosine_hard_restarts_schedule.png
|
||||
@@ -47,9 +54,22 @@ Learning Rate Schedules
|
||||
|
||||
|
||||
|
||||
.. autoclass:: transformers.WarmupLinearSchedule
|
||||
:members:
|
||||
.. autofunction:: transformers.get_linear_schedule_with_warmup
|
||||
|
||||
.. image:: /imgs/warmup_linear_schedule.png
|
||||
:target: /imgs/warmup_linear_schedule.png
|
||||
:alt:
|
||||
|
||||
``Warmup``
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.Warmup
|
||||
:members:
|
||||
|
||||
Gradient Strategies
|
||||
----------------------------------------------------
|
||||
|
||||
``GradientAccumulator``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.GradientAccumulator
|
||||
|
||||
@@ -54,5 +54,100 @@ Additionally, the following method can be used to load values from a data file
|
||||
Example usage
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
|
||||
|
||||
|
||||
XNLI
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
|
||||
the quality of cross-lingual text representations.
|
||||
XNLI is crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`: pairs of text are labeled with textual entailment
|
||||
annotations for 15 different languages (including both high-ressource language such as English and low-ressource languages such as Swahili).
|
||||
|
||||
It was released together with the paper
|
||||
`XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__
|
||||
|
||||
This library hosts the processor to load the XNLI data:
|
||||
- :class:`~transformers.data.processors.utils.XnliProcessor`
|
||||
|
||||
Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
|
||||
|
||||
An example using these processors is given in the
|
||||
`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
|
||||
`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
|
||||
|
||||
|
||||
SQuAD
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer//>`__ is a benchmark that evaluates
|
||||
the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
|
||||
`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside
|
||||
the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
|
||||
|
||||
This library hosts a processor for each of the two versions:
|
||||
|
||||
Processors
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Those processors are:
|
||||
- :class:`~transformers.data.processors.utils.SquadV1Processor`
|
||||
- :class:`~transformers.data.processors.utils.SquadV2Processor`
|
||||
|
||||
They both inherit from the abstract class :class:`~transformers.data.processors.utils.SquadProcessor`
|
||||
|
||||
.. autoclass:: transformers.data.processors.squad.SquadProcessor
|
||||
:members:
|
||||
|
||||
Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures`
|
||||
that can be used as model inputs.
|
||||
|
||||
.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
|
||||
|
||||
These processors as well as the aforementionned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
|
||||
Examples are given below.
|
||||
|
||||
|
||||
Example usage
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Here is an example using the processors as well as the conversion method using data files:
|
||||
|
||||
Example::
|
||||
|
||||
# Loading a V2 processor
|
||||
processor = SquadV2Processor()
|
||||
examples = processor.get_dev_examples(squad_v2_data_dir)
|
||||
|
||||
# Loading a V1 processor
|
||||
processor = SquadV1Processor()
|
||||
examples = processor.get_dev_examples(squad_v1_data_dir)
|
||||
|
||||
features = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=max_query_length,
|
||||
is_training=not evaluate,
|
||||
)
|
||||
|
||||
Using `tensorflow_datasets` is as easy as using a data file:
|
||||
|
||||
Example::
|
||||
|
||||
# tensorflow_datasets only handle Squad V1.
|
||||
tfds_examples = tfds.load("squad")
|
||||
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||
|
||||
features = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=max_query_length,
|
||||
is_training=not evaluate,
|
||||
)
|
||||
|
||||
|
||||
Another example using these processors is given in the
|
||||
`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
|
||||
|
||||
@@ -84,12 +84,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
|
||||
# Parameters:
|
||||
lr = 1e-3
|
||||
max_grad_norm = 1.0
|
||||
num_total_steps = 1000
|
||||
num_training_steps = 1000
|
||||
num_warmup_steps = 100
|
||||
warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
|
||||
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
|
||||
|
||||
### Previously BertAdam optimizer was instantiated like this:
|
||||
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
|
||||
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, num_training_steps=num_training_steps)
|
||||
### and used like this:
|
||||
for batch in train_data:
|
||||
loss = model(batch)
|
||||
@@ -98,12 +98,12 @@ for batch in train_data:
|
||||
|
||||
### In Transformers, optimizer and schedules are splitted and instantiated like this:
|
||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
|
||||
### and used like this:
|
||||
for batch in train_data:
|
||||
loss = model(batch)
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
|
||||
scheduler.step()
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
```
|
||||
|
||||
64
docs/source/model_doc/albert.rst
Normal file
64
docs/source/model_doc/albert.rst
Normal file
@@ -0,0 +1,64 @@
|
||||
ALBERT
|
||||
----------------------------------------------------
|
||||
|
||||
``AlbrtConfig``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertConfig
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertTokenizer``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertModel``
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertModel
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertForMaskedLM``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertForSequenceClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertForQuestionAnswering``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertForQuestionAnswering
|
||||
:members:
|
||||
|
||||
|
||||
``TFAlbertModel``
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFAlbertModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFAlbertForMaskedLM``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFAlbertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``TFAlbertForSequenceClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFAlbertForSequenceClassification
|
||||
:members:
|
||||
50
docs/source/model_doc/camembert.rst
Normal file
50
docs/source/model_doc/camembert.rst
Normal file
@@ -0,0 +1,50 @@
|
||||
CamemBERT
|
||||
----------------------------------------------------
|
||||
|
||||
``CamembertConfig``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertConfig
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertTokenizer``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertModel``
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertModel
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertForMaskedLM``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertForSequenceClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertForMultipleChoice``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertForMultipleChoice
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertForTokenClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertForTokenClassification
|
||||
:members:
|
||||
@@ -73,6 +73,9 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``gpt2-large`` | | 36-layer, 1280-hidden, 20-heads, 774M parameters. |
|
||||
| | | | OpenAI's Large-sized GPT-2 English model |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``gpt2-xl`` | | 48-layer, 1600-hidden, 25-heads, 1558M parameters. |
|
||||
| | | | OpenAI's XL-sized GPT-2 English model |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| Transformer-XL | ``transfo-xl-wt103`` | | 18-layer, 1024-hidden, 16-heads, 257M parameters. |
|
||||
| | | | English model trained on wikitext-103 |
|
||||
@@ -124,6 +127,14 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | ``roberta-large-mnli`` | | 24-layer, 1024-hidden, 16-heads, 355M parameters |
|
||||
| | | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__. |
|
||||
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``roberta-base-openai-detector`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
|
||||
| | | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. |
|
||||
| | | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``roberta-large-openai-detector`` | | 24-layer, 1024-hidden, 16-heads, 355M parameters |
|
||||
| | | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. |
|
||||
| | | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| DistilBERT | ``distilbert-base-uncased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
||||
@@ -140,10 +151,54 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
|
||||
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||
| | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``distilbert-base-multilingual-cased`` | | 6-layer, 768-hidden, 12-heads, 134M parameters |
|
||||
| | | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters |
|
||||
| | | | Salesforce's Large-sized CTRL English model |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| CamemBERT | ``camembert-base`` | | 12-layer, 768-hidden, 12-heads, 110M parameters |
|
||||
| | | | CamemBERT using the BERT-base architecture |
|
||||
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||
| | | | ALBERT base model |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||
| | | | ALBERT large model |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||
| | | | ALBERT xlarge model |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-xxlarge-v1`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||
| | | | ALBERT xxlarge model |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||
| | | | ALBERT base model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||
| | | | ALBERT large model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||
| | | | ALBERT xlarge model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-xxlarge-v2`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| T5 | ``t5-small`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
@@ -165,4 +220,5 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
.. <https://huggingface.co/transformers/examples.html>`__
|
||||
|
||||
.. <https://huggingface.co/transformers/examples.html>`__
|
||||
|
||||
@@ -188,3 +188,35 @@ assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
|
||||
```
|
||||
|
||||
Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
|
||||
|
||||
#### Using the past
|
||||
|
||||
GPT-2 as well as some other models (GPT, XLNet, Transfo-XL, CTRL) make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
|
||||
|
||||
Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):
|
||||
|
||||
```python
|
||||
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
||||
import torch
|
||||
|
||||
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||
|
||||
generated = tokenizer.encode("The Manhattan bridge")
|
||||
context = torch.tensor([generated])
|
||||
past = None
|
||||
|
||||
for i in range(100):
|
||||
print(i)
|
||||
output, past = model(context, past=past)
|
||||
token = torch.argmax(output[0, :])
|
||||
|
||||
generated += [token.tolist()]
|
||||
context = token.unsqueeze(0)
|
||||
|
||||
sequence = tokenizer.decode(generated)
|
||||
|
||||
print(sequence)
|
||||
```
|
||||
|
||||
The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
|
||||
@@ -106,7 +106,7 @@ This section explain how you can save and re-load a fine-tuned model (BERT, GPT,
|
||||
There are three types of files you need to save to be able to reload a fine-tuned model:
|
||||
|
||||
|
||||
* the model it-self which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
|
||||
* the model itself which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
|
||||
* the configuration file of the model which is saved as a JSON file, and
|
||||
* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
|
||||
|
||||
|
||||
Reference in New Issue
Block a user