Merge branch 'master' into master
This commit is contained in:
@@ -1,33 +1,77 @@
|
|||||||
version: 2
|
version: 2
|
||||||
jobs:
|
jobs:
|
||||||
build_py3:
|
build_py3_torch_and_tf:
|
||||||
working_directory: ~/pytorch-transformers
|
working_directory: ~/transformers
|
||||||
docker:
|
docker:
|
||||||
- image: circleci/python:3.5
|
- image: circleci/python:3.5
|
||||||
resource_class: xlarge
|
resource_class: xlarge
|
||||||
parallelism: 1
|
parallelism: 1
|
||||||
steps:
|
steps:
|
||||||
- checkout
|
- checkout
|
||||||
|
- run: sudo pip install torch
|
||||||
|
- run: sudo pip install tensorflow==2.0.0-rc0
|
||||||
- run: sudo pip install --progress-bar off .
|
- run: sudo pip install --progress-bar off .
|
||||||
- run: sudo pip install pytest codecov pytest-cov
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
- run: sudo pip install tensorboardX scikit-learn
|
- run: sudo pip install tensorboardX scikit-learn
|
||||||
- run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
|
- run: codecov
|
||||||
|
build_py3_torch:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.5
|
||||||
|
resource_class: xlarge
|
||||||
|
parallelism: 1
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install torch
|
||||||
|
- run: sudo pip install --progress-bar off .
|
||||||
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
|
- run: sudo pip install tensorboardX scikit-learn
|
||||||
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
- run: python -m pytest -sv ./examples/
|
- run: python -m pytest -sv ./examples/
|
||||||
- run: codecov
|
- run: codecov
|
||||||
build_py2:
|
build_py3_tf:
|
||||||
working_directory: ~/pytorch-transformers
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.5
|
||||||
|
resource_class: xlarge
|
||||||
|
parallelism: 1
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install tensorflow==2.0.0-rc0
|
||||||
|
- run: sudo pip install --progress-bar off .
|
||||||
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
|
- run: sudo pip install tensorboardX scikit-learn
|
||||||
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
|
- run: codecov
|
||||||
|
build_py2_torch:
|
||||||
|
working_directory: ~/transformers
|
||||||
resource_class: large
|
resource_class: large
|
||||||
parallelism: 1
|
parallelism: 1
|
||||||
docker:
|
docker:
|
||||||
- image: circleci/python:2.7
|
- image: circleci/python:2.7
|
||||||
steps:
|
steps:
|
||||||
- checkout
|
- checkout
|
||||||
|
- run: sudo pip install torch
|
||||||
- run: sudo pip install --progress-bar off .
|
- run: sudo pip install --progress-bar off .
|
||||||
- run: sudo pip install pytest codecov pytest-cov
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
- run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
|
- run: codecov
|
||||||
|
build_py2_tf:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
resource_class: large
|
||||||
|
parallelism: 1
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:2.7
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install tensorflow==2.0.0-rc0
|
||||||
|
- run: sudo pip install --progress-bar off .
|
||||||
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
- run: codecov
|
- run: codecov
|
||||||
deploy_doc:
|
deploy_doc:
|
||||||
working_directory: ~/pytorch-transformers
|
working_directory: ~/transformers
|
||||||
docker:
|
docker:
|
||||||
- image: circleci/python:3.5
|
- image: circleci/python:3.5
|
||||||
steps:
|
steps:
|
||||||
@@ -37,7 +81,6 @@ jobs:
|
|||||||
- checkout
|
- checkout
|
||||||
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
||||||
- run: sudo pip install --progress-bar off -r requirements.txt
|
- run: sudo pip install --progress-bar off -r requirements.txt
|
||||||
- run: cd docs/source && ln -s ../../examples/README.md examples.md && cd -
|
|
||||||
- run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
- run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
||||||
workflow_filters: &workflow_filters
|
workflow_filters: &workflow_filters
|
||||||
filters:
|
filters:
|
||||||
@@ -48,6 +91,9 @@ workflows:
|
|||||||
version: 2
|
version: 2
|
||||||
build_and_test:
|
build_and_test:
|
||||||
jobs:
|
jobs:
|
||||||
- build_py3
|
- build_py3_torch_and_tf
|
||||||
- build_py2
|
- build_py3_torch
|
||||||
|
- build_py3_tf
|
||||||
|
- build_py2_torch
|
||||||
|
- build_py2_tf
|
||||||
- deploy_doc: *workflow_filters
|
- deploy_doc: *workflow_filters
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
[run]
|
[run]
|
||||||
source=pytorch_transformers
|
source=transformers
|
||||||
omit =
|
omit =
|
||||||
# skip conversion scripts from testing for now
|
# skip conversion scripts from testing for now
|
||||||
*/convert_*
|
*/convert_*
|
||||||
|
|||||||
2
.github/ISSUE_TEMPLATE/migration.md
vendored
2
.github/ISSUE_TEMPLATE/migration.md
vendored
@@ -1,6 +1,6 @@
|
|||||||
---
|
---
|
||||||
name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
|
name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
|
||||||
about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers
|
about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
|
||||||
---
|
---
|
||||||
|
|
||||||
## 📚 Migration
|
## 📚 Migration
|
||||||
|
|||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -130,5 +130,5 @@ runs
|
|||||||
examples/runs
|
examples/runs
|
||||||
|
|
||||||
# data
|
# data
|
||||||
data
|
/data
|
||||||
serialization_dir
|
serialization_dir
|
||||||
246
README.md
246
README.md
@@ -1,47 +1,86 @@
|
|||||||
# 👾 PyTorch-Transformers
|
<p align="center">
|
||||||
|
<br>
|
||||||
|
<img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
|
||||||
|
<br>
|
||||||
|
</p>
|
||||||
|
<p align="center">
|
||||||
|
<a href="https://circleci.com/gh/huggingface/transformers">
|
||||||
|
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
|
||||||
|
</a>
|
||||||
|
<a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
|
||||||
|
<img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
|
||||||
|
</a>
|
||||||
|
<a href="https://huggingface.co/transformers/index.html">
|
||||||
|
<img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
|
||||||
|
</a>
|
||||||
|
<a href="https://github.com/huggingface/transformers/releases">
|
||||||
|
<img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
|
||||||
|
</a>
|
||||||
|
</p>
|
||||||
|
|
||||||
[](https://circleci.com/gh/huggingface/pytorch-transformers)
|
<h3 align="center">
|
||||||
|
<p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch</p>
|
||||||
|
</h3>
|
||||||
|
|
||||||
PyTorch-Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
|
🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
|
||||||
|
|
||||||
The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
|
### Features
|
||||||
|
|
||||||
1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
- As easy to use as pytorch-transformers
|
||||||
2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
- As powerful and concise as Keras
|
||||||
3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
- High performance on NLU and NLG tasks
|
||||||
4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
- Low barrier to entry for educators and practitioners
|
||||||
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
|
||||||
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
State-of-the-art NLP for everyone
|
||||||
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
- Deep learning researchers
|
||||||
8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
|
- Hands-on practitioners
|
||||||
) by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
- AI/ML/NLP teachers and educators
|
||||||
|
|
||||||
|
Lower compute costs, smaller carbon footprint
|
||||||
|
- Researchers can share trained models instead of always retraining
|
||||||
|
- Practitioners can reduce compute time and production costs
|
||||||
|
- 8 architectures with over 30 pretrained models, some in more than 100 languages
|
||||||
|
|
||||||
|
Choose the right framework for every part of a model's lifetime
|
||||||
|
- Train state-of-the-art models in 3 lines of code
|
||||||
|
- Deep interoperability between TensorFlow 2.0 and PyTorch models
|
||||||
|
- Move a single model between TF2.0/PyTorch frameworks at will
|
||||||
|
- Seamlessly pick the right framework for training, evaluation, production
|
||||||
|
|
||||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
|
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
|-|-|
|
|-|-|
|
||||||
| [Installation](#installation) | How to install the package |
|
| [Installation](#installation) | How to install the package |
|
||||||
|
| [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
|
||||||
| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
|
| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
|
||||||
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
||||||
|
| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
|
||||||
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
||||||
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
|
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
||||||
| [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more |
|
| [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
||||||
|
| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.0.0+
|
This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
|
||||||
|
|
||||||
### With pip
|
### With pip
|
||||||
|
|
||||||
PyTorch-Transformers can be installed by pip as follows:
|
First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
|
||||||
|
Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
|
||||||
|
|
||||||
|
When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install pytorch-transformers
|
pip install transformers
|
||||||
```
|
```
|
||||||
|
|
||||||
### From source
|
### From source
|
||||||
|
|
||||||
Clone the repository and run:
|
Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
|
||||||
|
Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
|
||||||
|
|
||||||
|
When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install [--editable] .
|
pip install [--editable] .
|
||||||
@@ -49,14 +88,16 @@ pip install [--editable] .
|
|||||||
|
|
||||||
### Tests
|
### Tests
|
||||||
|
|
||||||
A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples).
|
A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||||
|
|
||||||
These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
||||||
|
|
||||||
|
Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
|
||||||
|
|
||||||
You can run the tests from the root of the cloned repository with the commands:
|
You can run the tests from the root of the cloned repository with the commands:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m pytest -sv ./pytorch_transformers/tests/
|
python -m pytest -sv ./transformers/tests/
|
||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -66,8 +107,22 @@ You should check out our [`swift-coreml-transformers`](https://github.com/huggin
|
|||||||
|
|
||||||
It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
|
It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
|
||||||
|
|
||||||
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
|
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
|
||||||
or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
|
|
||||||
|
## Model architectures
|
||||||
|
|
||||||
|
🤗 Transformers currently provides 8 NLU/NLG architectures:
|
||||||
|
|
||||||
|
1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||||
|
2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||||
|
3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||||
|
4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||||
|
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||||
|
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||||
|
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
|
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
|
||||||
|
|
||||||
|
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
||||||
|
|
||||||
## Online demo
|
## Online demo
|
||||||
|
|
||||||
@@ -80,22 +135,25 @@ You can use it to experiment with completions generated by `GPT2Model`, `Transfo
|
|||||||
|
|
||||||
## Quick tour
|
## Quick tour
|
||||||
|
|
||||||
Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
|
Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import *
|
from transformers import *
|
||||||
|
|
||||||
# PyTorch-Transformers has a unified API
|
# Transformers has a unified API
|
||||||
# for 7 transformer architectures and 30 pretrained weights.
|
# for 8 transformer architectures and 30 pretrained weights.
|
||||||
# Model | Tokenizer | Pretrained weights shortcut
|
# Model | Tokenizer | Pretrained weights shortcut
|
||||||
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
|
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
|
||||||
(OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
|
(OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
|
||||||
(GPT2Model, GPT2Tokenizer, 'gpt2'),
|
(GPT2Model, GPT2Tokenizer, 'gpt2'),
|
||||||
(TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
|
(TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
|
||||||
(XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
|
(XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
|
||||||
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
|
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
|
||||||
(RobertaModel, RobertaTokenizer, 'roberta-base')]
|
(DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
|
||||||
|
(RobertaModel, RobertaTokenizer, 'roberta-base')]
|
||||||
|
|
||||||
|
# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
|
||||||
|
|
||||||
# Let's encode some text in a sequence of hidden-states using each model:
|
# Let's encode some text in a sequence of hidden-states using each model:
|
||||||
for model_class, tokenizer_class, pretrained_weights in MODELS:
|
for model_class, tokenizer_class, pretrained_weights in MODELS:
|
||||||
@@ -121,24 +179,71 @@ for model_class in BERT_MODEL_CLASSES:
|
|||||||
# Load pretrained model/tokenizer
|
# Load pretrained model/tokenizer
|
||||||
model = model_class.from_pretrained('bert-base-uncased')
|
model = model_class.from_pretrained('bert-base-uncased')
|
||||||
|
|
||||||
# Models can return full list of hidden-states & attentions weights at each layer
|
# Models can return full list of hidden-states & attentions weights at each layer
|
||||||
model = model_class.from_pretrained(pretrained_weights,
|
model = model_class.from_pretrained(pretrained_weights,
|
||||||
output_hidden_states=True,
|
output_hidden_states=True,
|
||||||
output_attentions=True)
|
output_attentions=True)
|
||||||
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
|
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
|
||||||
all_hidden_states, all_attentions = model(input_ids)[-2:]
|
all_hidden_states, all_attentions = model(input_ids)[-2:]
|
||||||
|
|
||||||
# Models are compatible with Torchscript
|
# Models are compatible with Torchscript
|
||||||
model = model_class.from_pretrained(pretrained_weights, torchscript=True)
|
model = model_class.from_pretrained(pretrained_weights, torchscript=True)
|
||||||
traced_model = torch.jit.trace(model, (input_ids,))
|
traced_model = torch.jit.trace(model, (input_ids,))
|
||||||
|
|
||||||
# Simple serialization for models and tokenizers
|
# Simple serialization for models and tokenizers
|
||||||
model.save_pretrained('./directory/to/save/') # save
|
model.save_pretrained('./directory/to/save/') # save
|
||||||
model = model_class.from_pretrained('./directory/to/save/') # re-load
|
model = model_class.from_pretrained('./directory/to/save/') # re-load
|
||||||
tokenizer.save_pretrained('./directory/to/save/') # save
|
tokenizer.save_pretrained('./directory/to/save/') # save
|
||||||
tokenizer = tokenizer_class.from_pretrained('./directory/to/save/') # re-load
|
tokenizer = BertTokenizer.from_pretrained('./directory/to/save/') # re-load
|
||||||
|
|
||||||
# SOTA examples for GLUE, SQUAD, text generation...
|
# SOTA examples for GLUE, SQUAD, text generation...
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick tour TF 2.0 training and PyTorch interoperability
|
||||||
|
|
||||||
|
Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow_datasets
|
||||||
|
from transformers import *
|
||||||
|
|
||||||
|
# Load dataset, tokenizer, model from pretrained model/vocabulary
|
||||||
|
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
||||||
|
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
|
||||||
|
data = tensorflow_datasets.load('glue/mrpc')
|
||||||
|
|
||||||
|
# Prepare dataset for GLUE as a tf.data.Dataset instance
|
||||||
|
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
|
||||||
|
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
|
||||||
|
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
|
||||||
|
valid_dataset = valid_dataset.batch(64)
|
||||||
|
|
||||||
|
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
|
||||||
|
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
|
||||||
|
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||||
|
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
|
||||||
|
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
|
||||||
|
|
||||||
|
# Train and evaluate using tf.keras.Model.fit()
|
||||||
|
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
|
||||||
|
validation_data=valid_dataset, validation_steps=7)
|
||||||
|
|
||||||
|
# Load the TensorFlow model in PyTorch for inspection
|
||||||
|
model.save_pretrained('./save/')
|
||||||
|
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
|
||||||
|
|
||||||
|
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
|
||||||
|
sentence_0 = "This research was consistent with his findings."
|
||||||
|
sentence_1 = "His findings were compatible with this research."
|
||||||
|
sentence_2 = "His findings were not compatible with this research."
|
||||||
|
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
|
||||||
|
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
|
||||||
|
|
||||||
|
pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
|
||||||
|
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
|
||||||
|
print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
|
||||||
|
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
|
||||||
```
|
```
|
||||||
|
|
||||||
## Quick tour of the fine-tuning/usage scripts
|
## Quick tour of the fine-tuning/usage scripts
|
||||||
@@ -288,7 +393,7 @@ This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-s
|
|||||||
### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
|
### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
|
||||||
|
|
||||||
A conditional generation script is also included to generate text from a prompt.
|
A conditional generation script is also included to generate text from a prompt.
|
||||||
The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
|
The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
|
||||||
|
|
||||||
Here is how to run the script with the small version of OpenAI GPT-2 model:
|
Here is how to run the script with the small version of OpenAI GPT-2 model:
|
||||||
|
|
||||||
@@ -299,19 +404,32 @@ python ./examples/run_generation.py \
|
|||||||
--model_name_or_path=gpt2 \
|
--model_name_or_path=gpt2 \
|
||||||
```
|
```
|
||||||
|
|
||||||
## Migrating from pytorch-pretrained-bert to pytorch-transformers
|
## Migrating from pytorch-transformers to transformers
|
||||||
|
|
||||||
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
|
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
|
||||||
|
|
||||||
|
### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
|
||||||
|
|
||||||
|
To be able to use TorchScript (see #1010, #1204 and #1195), the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
|
||||||
|
|
||||||
|
If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
|
||||||
|
|
||||||
|
If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
|
||||||
|
|
||||||
|
|
||||||
|
## Migrating from pytorch-pretrained-bert to transformers
|
||||||
|
|
||||||
|
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.
|
||||||
|
|
||||||
### Models always output `tuples`
|
### Models always output `tuples`
|
||||||
|
|
||||||
The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the model's forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
||||||
|
|
||||||
The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
|
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
|
||||||
|
|
||||||
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
||||||
|
|
||||||
Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
|
Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Let's load our model
|
# Let's load our model
|
||||||
@@ -320,11 +438,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
|||||||
# If you used to have this line in pytorch-pretrained-bert:
|
# If you used to have this line in pytorch-pretrained-bert:
|
||||||
loss = model(input_ids, labels=labels)
|
loss = model(input_ids, labels=labels)
|
||||||
|
|
||||||
# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
|
# Now just use this line in transformers to extract the loss from the output tuple:
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss = outputs[0]
|
loss = outputs[0]
|
||||||
|
|
||||||
# In pytorch-transformers you can also have access to the logits:
|
# In transformers you can also have access to the logits:
|
||||||
loss, logits = outputs[:2]
|
loss, logits = outputs[:2]
|
||||||
|
|
||||||
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
||||||
@@ -333,13 +451,17 @@ outputs = model(input_ids, labels=labels)
|
|||||||
loss, logits, attentions = outputs
|
loss, logits, attentions = outputs
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Using hidden states
|
||||||
|
|
||||||
|
By enabling the configuration option `output_hidden_states`, it was possible to retrieve the last hidden states of the encoder. In `pytorch-transformers` as well as `transformers` the return value has changed slightly: `all_hidden_states` now also includes the hidden state of the embeddings in addition to those of the encoding layers. This allows users to easily access the final state of the embeddings.
|
||||||
|
|
||||||
### Serialization
|
### Serialization
|
||||||
|
|
||||||
Breaking change in the `from_pretrained()`method:
|
Breaking change in the `from_pretrained()` method:
|
||||||
|
|
||||||
1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
|
1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
|
||||||
|
|
||||||
2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
|
2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding to the model's `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
|
||||||
|
|
||||||
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
|
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
|
||||||
|
|
||||||
@@ -396,7 +518,7 @@ for batch in train_data:
|
|||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
|
### In Transformers, optimizer and schedules are split and instantiated like this:
|
||||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
||||||
### and used like this:
|
### and used like this:
|
||||||
@@ -411,4 +533,4 @@ for batch in train_data:
|
|||||||
|
|
||||||
## Citation
|
## Citation
|
||||||
|
|
||||||
At the moment, there is no paper associated to PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
|
At the moment, there is no paper associated with Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
|
||||||
|
|||||||
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
|
|||||||
|
|
||||||
RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
|
RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
|
||||||
|
|
||||||
RUN pip install pytorch_transformers
|
RUN pip install transformers
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
@@ -34,11 +34,11 @@ pip install recommonmark
|
|||||||
|
|
||||||
## Building the documentation
|
## Building the documentation
|
||||||
|
|
||||||
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the followig
|
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
|
||||||
command to generate it:
|
command to generate it:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ln -s ../../examples/README.md source/examples.md
|
ln -s ../../examples/README.md examples.md
|
||||||
```
|
```
|
||||||
|
|
||||||
Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
|
Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
|
||||||
|
|||||||
@@ -26,4 +26,7 @@ sphinxcontrib-jsmath==1.0.1
|
|||||||
sphinxcontrib-qthelp==1.0.2
|
sphinxcontrib-qthelp==1.0.2
|
||||||
sphinxcontrib-serializinghtml==1.1.3
|
sphinxcontrib-serializinghtml==1.1.3
|
||||||
urllib3==1.25.3
|
urllib3==1.25.3
|
||||||
sphinx-markdown-tables==0.0.9
|
sphinx-markdown-tables==0.0.9
|
||||||
|
numpy==1.17.2
|
||||||
|
tensorflow==2.0.0rc2
|
||||||
|
torch==1.2.0
|
||||||
File diff suppressed because one or more lines are too long
@@ -15,4 +15,4 @@ In order to help this new field develop, we have included a few additional featu
|
|||||||
* accessing all the attention weights for each head of BERT/GPT/GPT-2,
|
* accessing all the attention weights for each head of BERT/GPT/GPT-2,
|
||||||
* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
|
* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
|
||||||
|
|
||||||
To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
|
To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ that extracts information from and prunes a model pre-trained on GLUE.
|
||||||
|
|||||||
@@ -19,14 +19,14 @@ sys.path.insert(0, os.path.abspath('../..'))
|
|||||||
|
|
||||||
# -- Project information -----------------------------------------------------
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
project = u'pytorch-transformers'
|
project = u'transformers'
|
||||||
copyright = u'2019, huggingface'
|
copyright = u'2019, huggingface'
|
||||||
author = u'huggingface'
|
author = u'huggingface'
|
||||||
|
|
||||||
# The short X.Y version
|
# The short X.Y version
|
||||||
version = u''
|
version = u''
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = u'1.2.0'
|
release = u'2.0.0'
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
@@ -109,7 +109,7 @@ html_static_path = ['_static']
|
|||||||
# -- Options for HTMLHelp output ---------------------------------------------
|
# -- Options for HTMLHelp output ---------------------------------------------
|
||||||
|
|
||||||
# Output file base name for HTML help builder.
|
# Output file base name for HTML help builder.
|
||||||
htmlhelp_basename = 'pytorch-transformersdoc'
|
htmlhelp_basename = 'transformersdoc'
|
||||||
|
|
||||||
|
|
||||||
# -- Options for LaTeX output ------------------------------------------------
|
# -- Options for LaTeX output ------------------------------------------------
|
||||||
@@ -136,7 +136,7 @@ latex_elements = {
|
|||||||
# (source start file, target name, title,
|
# (source start file, target name, title,
|
||||||
# author, documentclass [howto, manual, or own class]).
|
# author, documentclass [howto, manual, or own class]).
|
||||||
latex_documents = [
|
latex_documents = [
|
||||||
(master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation',
|
(master_doc, 'transformers.tex', u'transformers Documentation',
|
||||||
u'huggingface', 'manual'),
|
u'huggingface', 'manual'),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -146,7 +146,7 @@ latex_documents = [
|
|||||||
# One entry per manual page. List of tuples
|
# One entry per manual page. List of tuples
|
||||||
# (source start file, name, description, authors, manual section).
|
# (source start file, name, description, authors, manual section).
|
||||||
man_pages = [
|
man_pages = [
|
||||||
(master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
|
(master_doc, 'transformers', u'transformers Documentation',
|
||||||
[author], 1)
|
[author], 1)
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -157,8 +157,8 @@ man_pages = [
|
|||||||
# (source start file, target name, title, author,
|
# (source start file, target name, title, author,
|
||||||
# dir menu entry, description, category)
|
# dir menu entry, description, category)
|
||||||
texinfo_documents = [
|
texinfo_documents = [
|
||||||
(master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
|
(master_doc, 'transformers', u'transformers Documentation',
|
||||||
author, 'pytorch-transformers', 'One line description of project.',
|
author, 'transformers', 'One line description of project.',
|
||||||
'Miscellaneous'),
|
'Miscellaneous'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
|
|||||||
BERT
|
BERT
|
||||||
^^^^
|
^^^^
|
||||||
|
|
||||||
You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
|
You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
|
||||||
|
|
||||||
This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
|
This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
|
||||||
|
|
||||||
@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
|
|||||||
|
|
||||||
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
|
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
|
||||||
|
|
||||||
pytorch_transformers bert \
|
transformers bert \
|
||||||
$BERT_BASE_DIR/bert_model.ckpt \
|
$BERT_BASE_DIR/bert_model.ckpt \
|
||||||
$BERT_BASE_DIR/bert_config.json \
|
$BERT_BASE_DIR/bert_config.json \
|
||||||
$BERT_BASE_DIR/pytorch_model.bin
|
$BERT_BASE_DIR/pytorch_model.bin
|
||||||
@@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
|
|||||||
|
|
||||||
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
|
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
|
||||||
|
|
||||||
pytorch_transformers gpt \
|
transformers gpt \
|
||||||
$OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
|
$OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
[OPENAI_GPT_CONFIG]
|
[OPENAI_GPT_CONFIG]
|
||||||
@@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode
|
|||||||
|
|
||||||
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
|
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
|
||||||
|
|
||||||
pytorch_transformers gpt2 \
|
transformers gpt2 \
|
||||||
$OPENAI_GPT2_CHECKPOINT_PATH \
|
$OPENAI_GPT2_CHECKPOINT_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
[OPENAI_GPT2_CONFIG]
|
[OPENAI_GPT2_CONFIG]
|
||||||
@@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
|
|||||||
|
|
||||||
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
|
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
|
||||||
|
|
||||||
pytorch_transformers transfo_xl \
|
transformers transfo_xl \
|
||||||
$TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
|
$TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
[TRANSFO_XL_CONFIG]
|
[TRANSFO_XL_CONFIG]
|
||||||
@@ -80,7 +80,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
|
|||||||
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
|
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
|
||||||
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
|
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
|
||||||
|
|
||||||
pytorch_transformers xlnet \
|
transformers xlnet \
|
||||||
$TRANSFO_XL_CHECKPOINT_PATH \
|
$TRANSFO_XL_CHECKPOINT_PATH \
|
||||||
$TRANSFO_XL_CONFIG_PATH \
|
$TRANSFO_XL_CONFIG_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
@@ -96,6 +96,6 @@ Here is an example of the conversion process for a pre-trained XLM model:
|
|||||||
|
|
||||||
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
|
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
|
||||||
|
|
||||||
pytorch_transformers xlm \
|
transformers xlm \
|
||||||
$XLM_CHECKPOINT_PATH \
|
$XLM_CHECKPOINT_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
|
|||||||
1
docs/source/examples.md
Symbolic link
1
docs/source/examples.md
Symbolic link
@@ -0,0 +1 @@
|
|||||||
|
../../examples/README.md
|
||||||
BIN
docs/source/imgs/transformers_logo_name.png
Normal file
BIN
docs/source/imgs/transformers_logo_name.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 8.7 KiB |
@@ -1,9 +1,43 @@
|
|||||||
Pytorch-Transformers
|
Transformers
|
||||||
================================================================================================================================================
|
================================================================================================================================================
|
||||||
|
|
||||||
PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
|
🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
|
||||||
|
(BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
|
||||||
|
(NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
|
||||||
|
|
||||||
The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
|
This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
|
||||||
|
|
||||||
|
Features
|
||||||
|
---------------------------------------------------
|
||||||
|
|
||||||
|
- As easy to use as pytorch-transformers
|
||||||
|
- As powerful and concise as Keras
|
||||||
|
- High performance on NLU and NLG tasks
|
||||||
|
- Low barrier to entry for educators and practitioners
|
||||||
|
|
||||||
|
State-of-the-art NLP for everyone:
|
||||||
|
|
||||||
|
- Deep learning researchers
|
||||||
|
- Hands-on practitioners
|
||||||
|
- AI/ML/NLP teachers and educators
|
||||||
|
|
||||||
|
Lower compute costs, smaller carbon footprint:
|
||||||
|
|
||||||
|
- Researchers can share trained models instead of always retraining
|
||||||
|
- Practitioners can reduce compute time and production costs
|
||||||
|
- 8 architectures with over 30 pretrained models, some in more than 100 languages
|
||||||
|
|
||||||
|
Choose the right framework for every part of a model's lifetime:
|
||||||
|
|
||||||
|
- Train state-of-the-art models in 3 lines of code
|
||||||
|
- Deep interoperability between TensorFlow 2.0 and PyTorch models
|
||||||
|
- Move a single model between TF2.0/PyTorch frameworks at will
|
||||||
|
- Seamlessly pick the right framework for training, evaluation, production
|
||||||
|
|
||||||
|
Contents
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
The library currently contains PyTorch and TensorFlow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
|
||||||
|
|
||||||
1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||||
2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||||
@@ -12,7 +46,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
|
|||||||
5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||||
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
||||||
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
@@ -37,6 +71,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
|
|||||||
main_classes/model
|
main_classes/model
|
||||||
main_classes/tokenizer
|
main_classes/tokenizer
|
||||||
main_classes/optimizer_schedules
|
main_classes/optimizer_schedules
|
||||||
|
main_classes/processors
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
Installation
|
Installation
|
||||||
================================================
|
================================================
|
||||||
|
|
||||||
PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
|
Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
|
||||||
|
|
||||||
With pip
|
With pip
|
||||||
^^^^^^^^
|
^^^^^^^^
|
||||||
@@ -10,7 +10,7 @@ PyTorch Transformers can be installed using pip as follows:
|
|||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
pip install pytorch-transformers
|
pip install transformers
|
||||||
|
|
||||||
From source
|
From source
|
||||||
^^^^^^^^^^^
|
^^^^^^^^^^^
|
||||||
@@ -19,15 +19,15 @@ To install from source, clone the repository and install with:
|
|||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
git clone https://github.com/huggingface/pytorch-transformers.git
|
git clone https://github.com/huggingface/transformers.git
|
||||||
cd pytorch-transformers
|
cd transformers
|
||||||
pip install [--editable] .
|
pip install [--editable] .
|
||||||
|
|
||||||
|
|
||||||
Tests
|
Tests
|
||||||
^^^^^
|
^^^^^
|
||||||
|
|
||||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
|
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/transformers/tree/master/transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/transformers/tree/master/examples>`_.
|
||||||
|
|
||||||
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@ Run all the tests from the root of the cloned repository with the commands:
|
|||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
python -m pytest -sv ./pytorch_transformers/tests/
|
python -m pytest -sv ./transformers/tests/
|
||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,5 +6,5 @@ The base class ``PretrainedConfig`` implements the common methods for loading/sa
|
|||||||
``PretrainedConfig``
|
``PretrainedConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.PretrainedConfig
|
.. autoclass:: transformers.PretrainedConfig
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -11,5 +11,11 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
|
|||||||
``PreTrainedModel``
|
``PreTrainedModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.PreTrainedModel
|
.. autoclass:: transformers.PreTrainedModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
``TFPreTrainedModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFPreTrainedModel
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ The ``.optimization`` module provides:
|
|||||||
``AdamW``
|
``AdamW``
|
||||||
~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.AdamW
|
.. autoclass:: transformers.AdamW
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
Schedules
|
Schedules
|
||||||
@@ -18,11 +18,11 @@ Schedules
|
|||||||
Learning Rate Schedules
|
Learning Rate Schedules
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.ConstantLRSchedule
|
.. autoclass:: transformers.ConstantLRSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.WarmupConstantSchedule
|
.. autoclass:: transformers.WarmupConstantSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_constant_schedule.png
|
.. image:: /imgs/warmup_constant_schedule.png
|
||||||
@@ -30,7 +30,7 @@ Learning Rate Schedules
|
|||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.WarmupCosineSchedule
|
.. autoclass:: transformers.WarmupCosineSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_cosine_schedule.png
|
.. image:: /imgs/warmup_cosine_schedule.png
|
||||||
@@ -38,7 +38,7 @@ Learning Rate Schedules
|
|||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
|
.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
||||||
@@ -47,7 +47,7 @@ Learning Rate Schedules
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.WarmupLinearSchedule
|
.. autoclass:: transformers.WarmupLinearSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_linear_schedule.png
|
.. image:: /imgs/warmup_linear_schedule.png
|
||||||
|
|||||||
58
docs/source/main_classes/processors.rst
Normal file
58
docs/source/main_classes/processors.rst
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
Processors
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
This library includes processors for several traditional tasks. These processors can be used to process a dataset into
|
||||||
|
examples that can be fed to a model.
|
||||||
|
|
||||||
|
Processors
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
All processors follow the same architecture which is that of the
|
||||||
|
:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
|
||||||
|
of :class:`~transformers.data.processors.utils.InputExample`. These
|
||||||
|
:class:`~transformers.data.processors.utils.InputExample` can be converted to
|
||||||
|
:class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
|
||||||
|
|
||||||
|
.. autoclass:: transformers.data.processors.utils.DataProcessor
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
.. autoclass:: transformers.data.processors.utils.InputExample
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
.. autoclass:: transformers.data.processors.utils.InputFeatures
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
GLUE
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
|
||||||
|
the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
|
||||||
|
`GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__.
|
||||||
|
|
||||||
|
This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
|
||||||
|
CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
|
||||||
|
|
||||||
|
Those processors are:
|
||||||
|
- :class:`~transformers.data.processors.utils.MrpcProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.MnliProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.Sst2Processor`
|
||||||
|
- :class:`~transformers.data.processors.utils.StsbProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.QqpProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.QnliProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.RteProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.WnliProcessor`
|
||||||
|
|
||||||
|
Additionally, the following method can be used to load values from a data file and convert them to a list of
|
||||||
|
:class:`~transformers.data.processors.utils.InputExample`.
|
||||||
|
|
||||||
|
.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
|
||||||
|
|
||||||
|
Example usage
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
An example using these processors is given in the
|
||||||
|
`run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
|
||||||
@@ -12,5 +12,5 @@ The base class ``PreTrainedTokenizer`` implements the common methods for loading
|
|||||||
``PreTrainedTokenizer``
|
``PreTrainedTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.PreTrainedTokenizer
|
.. autoclass:: transformers.PreTrainedTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -1,17 +1,17 @@
|
|||||||
# Migrating from pytorch-pretrained-bert
|
# Migrating from pytorch-pretrained-bert
|
||||||
|
|
||||||
|
|
||||||
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
|
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
|
||||||
|
|
||||||
### Models always output `tuples`
|
### Models always output `tuples`
|
||||||
|
|
||||||
The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
||||||
|
|
||||||
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
|
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
|
||||||
|
|
||||||
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
||||||
|
|
||||||
Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
|
Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Let's load our model
|
# Let's load our model
|
||||||
@@ -20,11 +20,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
|||||||
# If you used to have this line in pytorch-pretrained-bert:
|
# If you used to have this line in pytorch-pretrained-bert:
|
||||||
loss = model(input_ids, labels=labels)
|
loss = model(input_ids, labels=labels)
|
||||||
|
|
||||||
# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
|
# Now just use this line in transformers to extract the loss from the output tuple:
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss = outputs[0]
|
loss = outputs[0]
|
||||||
|
|
||||||
# In pytorch-transformers you can also have access to the logits:
|
# In transformers you can also have access to the logits:
|
||||||
loss, logits = outputs[:2]
|
loss, logits = outputs[:2]
|
||||||
|
|
||||||
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
||||||
@@ -96,7 +96,7 @@ for batch in train_data:
|
|||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
### In PyTorch-Transformers, optimizer and schedules are split and instantiated like this:
|
### In Transformers, optimizer and schedules are split and instantiated like this:
|
||||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
||||||
### and used like this:
|
### and used like this:
|
||||||
|
|||||||
@@ -11,19 +11,19 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
|
|||||||
``AutoConfig``
|
``AutoConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.AutoConfig
|
.. autoclass:: transformers.AutoConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``AutoModel``
|
``AutoModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.AutoModel
|
.. autoclass:: transformers.AutoModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``AutoTokenizer``
|
``AutoTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.AutoTokenizer
|
.. autoclass:: transformers.AutoTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,69 +4,125 @@ BERT
|
|||||||
``BertConfig``
|
``BertConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertConfig
|
.. autoclass:: transformers.BertConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertTokenizer``
|
``BertTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertTokenizer
|
.. autoclass:: transformers.BertTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertModel``
|
``BertModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertModel
|
.. autoclass:: transformers.BertModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForPreTraining``
|
``BertForPreTraining``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForPreTraining
|
.. autoclass:: transformers.BertForPreTraining
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForMaskedLM``
|
``BertForMaskedLM``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForMaskedLM
|
.. autoclass:: transformers.BertForMaskedLM
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForNextSentencePrediction``
|
``BertForNextSentencePrediction``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
|
.. autoclass:: transformers.BertForNextSentencePrediction
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForSequenceClassification``
|
``BertForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForSequenceClassification
|
.. autoclass:: transformers.BertForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForMultipleChoice``
|
``BertForMultipleChoice``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForMultipleChoice
|
.. autoclass:: transformers.BertForMultipleChoice
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForTokenClassification``
|
``BertForTokenClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForTokenClassification
|
.. autoclass:: transformers.BertForTokenClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForQuestionAnswering``
|
``BertForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForQuestionAnswering
|
.. autoclass:: transformers.BertForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForPreTraining``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForPreTraining
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForNextSentencePrediction``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForNextSentencePrediction
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForMultipleChoice``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForMultipleChoice
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForTokenClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForTokenClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForQuestionAnswering``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForQuestionAnswering
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|||||||
@@ -4,40 +4,67 @@ DistilBERT
|
|||||||
``DistilBertConfig``
|
``DistilBertConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertConfig
|
.. autoclass:: transformers.DistilBertConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertTokenizer``
|
``DistilBertTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertTokenizer
|
.. autoclass:: transformers.DistilBertTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertModel``
|
``DistilBertModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertModel
|
.. autoclass:: transformers.DistilBertModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertForMaskedLM``
|
``DistilBertForMaskedLM``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
|
.. autoclass:: transformers.DistilBertForMaskedLM
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertForSequenceClassification``
|
``DistilBertForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
|
.. autoclass:: transformers.DistilBertForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertForQuestionAnswering``
|
``DistilBertForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertForQuestionAnswering
|
.. autoclass:: transformers.DistilBertForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
``TFDistilBertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFDistilBertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFDistilBertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFDistilBertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFDistilBertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFDistilBertForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFDistilBertForQuestionAnswering``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFDistilBertForQuestionAnswering
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,33 +4,54 @@ OpenAI GPT
|
|||||||
``OpenAIGPTConfig``
|
``OpenAIGPTConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTConfig
|
.. autoclass:: transformers.OpenAIGPTConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``OpenAIGPTTokenizer``
|
``OpenAIGPTTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
|
.. autoclass:: transformers.OpenAIGPTTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``OpenAIGPTModel``
|
``OpenAIGPTModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTModel
|
.. autoclass:: transformers.OpenAIGPTModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``OpenAIGPTLMHeadModel``
|
``OpenAIGPTLMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
|
.. autoclass:: transformers.OpenAIGPTLMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``OpenAIGPTDoubleHeadsModel``
|
``OpenAIGPTDoubleHeadsModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
|
.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFOpenAIGPTModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFOpenAIGPTModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFOpenAIGPTLMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFOpenAIGPTLMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFOpenAIGPTDoubleHeadsModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,33 +4,54 @@ OpenAI GPT2
|
|||||||
``GPT2Config``
|
``GPT2Config``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2Config
|
.. autoclass:: transformers.GPT2Config
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``GPT2Tokenizer``
|
``GPT2Tokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2Tokenizer
|
.. autoclass:: transformers.GPT2Tokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``GPT2Model``
|
``GPT2Model``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2Model
|
.. autoclass:: transformers.GPT2Model
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``GPT2LMHeadModel``
|
``GPT2LMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2LMHeadModel
|
.. autoclass:: transformers.GPT2LMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``GPT2DoubleHeadsModel``
|
``GPT2DoubleHeadsModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
|
.. autoclass:: transformers.GPT2DoubleHeadsModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFGPT2Model``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFGPT2Model
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFGPT2LMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFGPT2LMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFGPT2DoubleHeadsModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFGPT2DoubleHeadsModel
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,33 +4,54 @@ RoBERTa
|
|||||||
``RobertaConfig``
|
``RobertaConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaConfig
|
.. autoclass:: transformers.RobertaConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``RobertaTokenizer``
|
``RobertaTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaTokenizer
|
.. autoclass:: transformers.RobertaTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``RobertaModel``
|
``RobertaModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaModel
|
.. autoclass:: transformers.RobertaModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``RobertaForMaskedLM``
|
``RobertaForMaskedLM``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaForMaskedLM
|
.. autoclass:: transformers.RobertaForMaskedLM
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``RobertaForSequenceClassification``
|
``RobertaForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaForSequenceClassification
|
.. autoclass:: transformers.RobertaForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFRobertaModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFRobertaModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFRobertaForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFRobertaForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFRobertaForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFRobertaForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -5,26 +5,40 @@ Transformer XL
|
|||||||
``TransfoXLConfig``
|
``TransfoXLConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.TransfoXLConfig
|
.. autoclass:: transformers.TransfoXLConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``TransfoXLTokenizer``
|
``TransfoXLTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.TransfoXLTokenizer
|
.. autoclass:: transformers.TransfoXLTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``TransfoXLModel``
|
``TransfoXLModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.TransfoXLModel
|
.. autoclass:: transformers.TransfoXLModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``TransfoXLLMHeadModel``
|
``TransfoXLLMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
|
.. autoclass:: transformers.TransfoXLLMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFTransfoXLModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFTransfoXLModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFTransfoXLLMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFTransfoXLLMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,38 +4,66 @@ XLM
|
|||||||
``XLMConfig``
|
``XLMConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMConfig
|
.. autoclass:: transformers.XLMConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
``XLMTokenizer``
|
``XLMTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMTokenizer
|
.. autoclass:: transformers.XLMTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
``XLMModel``
|
``XLMModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMModel
|
.. autoclass:: transformers.XLMModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLMWithLMHeadModel``
|
``XLMWithLMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
|
.. autoclass:: transformers.XLMWithLMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLMForSequenceClassification``
|
``XLMForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMForSequenceClassification
|
.. autoclass:: transformers.XLMForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLMForQuestionAnswering``
|
``XLMForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
|
.. autoclass:: transformers.XLMForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLMModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLMModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLMWithLMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLMWithLMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLMForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLMForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLMForQuestionAnsweringSimple``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,40 +4,68 @@ XLNet
|
|||||||
``XLNetConfig``
|
``XLNetConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetConfig
|
.. autoclass:: transformers.XLNetConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetTokenizer``
|
``XLNetTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetTokenizer
|
.. autoclass:: transformers.XLNetTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetModel``
|
``XLNetModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetModel
|
.. autoclass:: transformers.XLNetModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetLMHeadModel``
|
``XLNetLMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetLMHeadModel
|
.. autoclass:: transformers.XLNetLMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetForSequenceClassification``
|
``XLNetForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetForSequenceClassification
|
.. autoclass:: transformers.XLNetForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetForQuestionAnswering``
|
``XLNetForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
|
.. autoclass:: transformers.XLNetForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLNetModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLNetModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLNetLMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLNetLMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLNetForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLNetForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLNetForQuestionAnsweringSimple``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -1,16 +1,16 @@
|
|||||||
Notebooks
|
Notebooks
|
||||||
================================================
|
================================================
|
||||||
|
|
||||||
We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
|
We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
|
||||||
|
|
||||||
|
|
||||||
*
|
*
|
||||||
The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
|
The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
|
||||||
|
|
||||||
*
|
*
|
||||||
The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
|
The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
|
||||||
|
|
||||||
*
|
*
|
||||||
The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
|
The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
|
||||||
|
|
||||||
Please follow the instructions given in the notebooks to run and modify them.
|
Please follow the instructions given in the notebooks to run and modify them.
|
||||||
|
|||||||
@@ -44,15 +44,15 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
|
| | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
|
||||||
| | | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD |
|
| | | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD |
|
||||||
| | | (see details of fine-tuning in the `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__). |
|
| | | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__). |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``bert-large-cased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters |
|
| | ``bert-large-cased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters |
|
||||||
| | | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD |
|
| | | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD |
|
||||||
| | | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__) |
|
| | | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``bert-base-cased-finetuned-mrpc`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
| | ``bert-base-cased-finetuned-mrpc`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
| | | | The ``bert-base-cased`` model fine-tuned on MRPC |
|
| | | | The ``bert-base-cased`` model fine-tuned on MRPC |
|
||||||
| | | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__) |
|
| | | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
| | | | OpenAI GPT English model |
|
| | | | OpenAI GPT English model |
|
||||||
@@ -98,6 +98,12 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``xlm-clm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads |
|
| | ``xlm-clm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads |
|
||||||
| | | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia |
|
| | | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``xlm-mlm-17-1280`` | | 16-layer, 1280-hidden, 16-heads |
|
||||||
|
| | | | XLM model trained with MLM (Masked Language Modeling) on 17 languages. |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``xlm-mlm-100-1280`` | | 16-layer, 1280-hidden, 16-heads |
|
||||||
|
| | | | XLM model trained with MLM (Masked Language Modeling) on 100 languages. |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| RoBERTa | ``roberta-base`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
|
| RoBERTa | ``roberta-base`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
|
||||||
| | | | RoBERTa using the BERT-base architecture |
|
| | | | RoBERTa using the BERT-base architecture |
|
||||||
@@ -113,11 +119,14 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| DistilBERT | ``distilbert-base-uncased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
| DistilBERT | ``distilbert-base-uncased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||||
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
||||||
| | | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__) |
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``distilbert-base-uncased-distilled-squad`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
| | ``distilbert-base-uncased-distilled-squad`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||||
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. |
|
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. |
|
||||||
| | | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__) |
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``distilgpt2`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
|
||||||
|
| | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. |
|
||||||
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
.. <https://huggingface.co/transformers/examples.html>`__
|
||||||
.. <https://huggingface.co/pytorch-transformers/examples.html>`__
|
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Philosophy
|
## Philosophy
|
||||||
|
|
||||||
PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
|
Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
|
||||||
|
|
||||||
The library was designed with two strong goals in mind:
|
The library was designed with two strong goals in mind:
|
||||||
|
|
||||||
@@ -19,12 +19,12 @@ The library was designed with two strong goals in mind:
|
|||||||
|
|
||||||
A few other goals:
|
A few other goals:
|
||||||
|
|
||||||
- expose the models internals as consistently as possible:
|
- expose the models' internals as consistently as possible:
|
||||||
|
|
||||||
- we give access, using a single API, to the full hidden-states and attention weights,
|
- we give access, using a single API, to the full hidden-states and attention weights,
|
||||||
- tokenizer and base model's API are standardized to easily switch between models.
|
- tokenizer and base model's API are standardized to easily switch between models.
|
||||||
|
|
||||||
- incorporate a subjective selection of promising tools for fine-tuning/investiguating these models:
|
- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
|
||||||
|
|
||||||
- a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
|
- a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
|
||||||
- simple ways to mask and prune transformer heads.
|
- simple ways to mask and prune transformer heads.
|
||||||
@@ -33,13 +33,13 @@ A few other goals:
|
|||||||
|
|
||||||
The library is built around three types of classes for each model:
|
The library is built around three types of classes for each model:
|
||||||
|
|
||||||
- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 6 model architectures currently provided in the library, e.g. `BertModel`
|
- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 model architectures currently provided in the library, e.g. `BertModel`
|
||||||
- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
|
- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
|
||||||
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings to/from lists of token embedding indices to be fed to a model, e.g. `BertTokenizer`
|
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings to/from lists of token embedding indices to be fed to a model, e.g. `BertTokenizer`
|
||||||
|
|
||||||
All these classes can be instantiated from pretrained instances and saved locally using two methods:
|
All these classes can be instantiated from pretrained instances and saved locally using two methods:
|
||||||
|
|
||||||
- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
|
- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
|
||||||
- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
|
- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
|
||||||
|
|
||||||
We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
|
We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
|
||||||
@@ -51,7 +51,7 @@ We'll finish this quickstart tour by going through a few simple quick-start exam
|
|||||||
|
|
||||||
Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
|
Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
|
||||||
|
|
||||||
See full API reference for examples for each model classe.
|
See full API reference for examples for each model class.
|
||||||
|
|
||||||
### BERT example
|
### BERT example
|
||||||
|
|
||||||
@@ -59,7 +59,7 @@ Let's start by preparing a tokenized input (a list of token embeddings indices t
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
|
from transformers import BertTokenizer, BertModel, BertForMaskedLM
|
||||||
|
|
||||||
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
|
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
|
||||||
import logging
|
import logging
|
||||||
@@ -93,8 +93,8 @@ Let's see how we can use `BertModel` to encode our inputs in hidden-states:
|
|||||||
# Load pre-trained model (weights)
|
# Load pre-trained model (weights)
|
||||||
model = BertModel.from_pretrained('bert-base-uncased')
|
model = BertModel.from_pretrained('bert-base-uncased')
|
||||||
|
|
||||||
# Set the model in evaluation mode to desactivate the DropOut modules
|
# Set the model in evaluation mode to deactivate the DropOut modules
|
||||||
# This is IMPORTANT to have reproductible results during evaluation!
|
# This is IMPORTANT to have reproducible results during evaluation!
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
# If you have a GPU, put everything on cuda
|
# If you have a GPU, put everything on cuda
|
||||||
@@ -106,7 +106,7 @@ model.to('cuda')
|
|||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# See the models docstrings for the detail of the inputs
|
# See the models docstrings for the detail of the inputs
|
||||||
outputs = model(tokens_tensor, token_type_ids=segments_tensors)
|
outputs = model(tokens_tensor, token_type_ids=segments_tensors)
|
||||||
# PyTorch-Transformers models always output tuples.
|
# Transformers models always output tuples.
|
||||||
# See the models docstrings for the detail of all the outputs
|
# See the models docstrings for the detail of all the outputs
|
||||||
# In our case, the first element is the hidden state of the last layer of the Bert model
|
# In our case, the first element is the hidden state of the last layer of the Bert model
|
||||||
encoded_layers = outputs[0]
|
encoded_layers = outputs[0]
|
||||||
@@ -145,7 +145,7 @@ First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
|
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||||
|
|
||||||
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
|
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
|
||||||
import logging
|
import logging
|
||||||
@@ -168,8 +168,8 @@ Let's see how to use `GPT2LMHeadModel` to generate the next token following our
|
|||||||
# Load pre-trained model (weights)
|
# Load pre-trained model (weights)
|
||||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||||
|
|
||||||
# Set the model in evaluation mode to desactivate the DropOut modules
|
# Set the model in evaluation mode to deactivate the DropOut modules
|
||||||
# This is IMPORTANT to have reproductible results during evaluation!
|
# This is IMPORTANT to have reproducible results during evaluation!
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
# If you have a GPU, put everything on cuda
|
# If you have a GPU, put everything on cuda
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ where
|
|||||||
* ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
|
* ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
|
||||||
* ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
|
* ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
|
||||||
|
|
||||||
If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
|
If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
|
||||||
|
|
||||||
*
|
*
|
||||||
``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
|
``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
|
||||||
@@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to
|
|||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
|
from transformers import WEIGHTS_NAME, CONFIG_NAME
|
||||||
|
|
||||||
output_dir = "./models/"
|
output_dir = "./models/"
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab
|
|||||||
Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
|
Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
|
||||||
their model to be re-used in other programs, such as efficiency-oriented C++ programs.
|
their model to be re-used in other programs, such as efficiency-oriented C++ programs.
|
||||||
|
|
||||||
We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can
|
We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
|
||||||
be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
|
be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
|
||||||
they can be exported, and what to be mindful of when using these models with TorchScript.
|
they can be exported, and what to be mindful of when using these models with TorchScript.
|
||||||
|
|
||||||
@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
|
|||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
from pytorch_transformers import BertModel, BertTokenizer, BertConfig
|
from transformers import BertModel, BertTokenizer, BertConfig
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
enc = BertTokenizer.from_pretrained("bert-base-uncased")
|
enc = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ similar API between the different models.
|
|||||||
|
|
||||||
## Language model fine-tuning
|
## Language model fine-tuning
|
||||||
|
|
||||||
Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py).
|
Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
|
||||||
|
|
||||||
Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT
|
Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT
|
||||||
to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa
|
to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa
|
||||||
@@ -75,7 +75,7 @@ python run_lm_finetuning.py \
|
|||||||
|
|
||||||
## Language generation
|
## Language generation
|
||||||
|
|
||||||
Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py).
|
Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
|
||||||
|
|
||||||
Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
|
Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
|
||||||
A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
|
A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
|
||||||
@@ -91,7 +91,7 @@ python run_generation.py \
|
|||||||
|
|
||||||
## GLUE
|
## GLUE
|
||||||
|
|
||||||
Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py).
|
Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
|
||||||
|
|
||||||
Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
|
Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
|
||||||
Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
|
Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
|
||||||
@@ -319,7 +319,7 @@ eval_loss = 0.44457291918821606
|
|||||||
|
|
||||||
## SQuAD
|
## SQuAD
|
||||||
|
|
||||||
Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py).
|
Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
|
||||||
|
|
||||||
#### Fine-tuning on SQuAD
|
#### Fine-tuning on SQuAD
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ import torch
|
|||||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||||
TensorDataset)
|
TensorDataset)
|
||||||
|
|
||||||
from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
||||||
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
||||||
WarmupLinearSchedule)
|
WarmupLinearSchedule)
|
||||||
|
|
||||||
|
|||||||
@@ -35,10 +35,10 @@ from tqdm import tqdm, trange
|
|||||||
|
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForMultipleChoice, BertTokenizer)
|
BertForMultipleChoice, BertTokenizer)
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, WarmupLinearSchedule
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -365,7 +365,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
# inputs.update({'cls_index': batch[5],
|
# inputs.update({'cls_index': batch[5],
|
||||||
# 'p_mask': batch[6]})
|
# 'p_mask': batch[6]})
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||||
@@ -647,7 +647,7 @@ def main():
|
|||||||
|
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||||
|
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import math
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
|
from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
|
|||||||
@@ -1,35 +1,40 @@
|
|||||||
# DistilBERT
|
# Distil*
|
||||||
|
|
||||||
This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT.
|
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT and DistilGPT2.
|
||||||
|
|
||||||
|
**2019, October 3rd - Update:** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2.
|
||||||
|
|
||||||
**2019, September 19th - Update:** We fixed bugs in the code and released an updated version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
|
**2019, September 19th - Update:** We fixed bugs in the code and released an updated version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
|
||||||
|
|
||||||
## What is DistilBERT
|
## What is Distil*
|
||||||
|
|
||||||
DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
|
Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on the BERT architecture. It has 40% fewer parameters than `bert-base-uncased` and runs 60% faster while preserving 97% of BERT's performance as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distilling BERT, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option for putting large-scale trained Transformer models into production.
|
||||||
|
|
||||||
For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
|
We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.8 compared to 19.3 for DistilGPT2 (after fine-tuning on the train set).
|
||||||
). *Please note that we will publish a formal write-up with updated and more complete results in the near future (September 19th).*
|
|
||||||
|
|
||||||
Here's the updated results on the dev sets of GLUE:
|
For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performance.
|
||||||
|
|
||||||
| Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2 | STS-B | WNLI |
|
Here are the results on the dev sets of GLUE:
|
||||||
|
|
||||||
|
| Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2| STS-B| WNLI |
|
||||||
| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
|
| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
|
||||||
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
||||||
| DistilBERT | **75.2** | 49.1 | 81.8 | 90.2 | 87.0 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
|
This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
|
||||||
|
|
||||||
**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details.
|
**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend that you [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details.
|
||||||
|
|
||||||
## How to use DistilBERT
|
## How to use DistilBERT
|
||||||
|
|
||||||
PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
Transformers includes three pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
||||||
|
|
||||||
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, a dimension of 768 and 12 heads, totaling 66M parameters.
|
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, a dimension of 768 and 12 heads, totaling 66M parameters.
|
||||||
- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` fine-tuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, the `bert-base-uncased` version of Bert reaches an F1 score of 88.5).
|
- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` fine-tuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, the `bert-base-uncased` version of Bert reaches an F1 score of 88.5).
|
||||||
|
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, a dimension of 768 and 12 heads, totaling 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
|
||||||
|
- and more to come! 🤗🤗🤗
|
||||||
|
|
||||||
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`, even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
|
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`, even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
|
||||||
|
|
||||||
@@ -42,9 +47,11 @@ outputs = model(input_ids)
|
|||||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||||
```
|
```
|
||||||
|
|
||||||
## How to train DistilBERT
|
Similarly, using DistilGPT2 simply consists of calling the GPT2 classes from a different pretrained checkpoint: `model = GPT2Model.from_pretrained('distilgpt2')`.
|
||||||
|
|
||||||
In the following, we will explain how you can train your own compressed model.
|
## How to train Distil*
|
||||||
|
|
||||||
|
In the following, we will explain how you can train DistilBERT.
|
||||||
|
|
||||||
### A. Preparing the data
|
### A. Preparing the data
|
||||||
|
|
||||||
@@ -57,7 +64,8 @@ First, we will binarize the data, i.e. tokenize the data and convert each token
|
|||||||
```bash
|
```bash
|
||||||
python scripts/binarized_data.py \
|
python scripts/binarized_data.py \
|
||||||
--file_path data/dump.txt \
|
--file_path data/dump.txt \
|
||||||
--bert_tokenizer bert-base-uncased \
|
--tokenizer_type bert \
|
||||||
|
--tokenizer_name bert-base-uncased \
|
||||||
--dump_file data/binarized_text
|
--dump_file data/binarized_text
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -66,7 +74,8 @@ Our implementation of masked language modeling loss follows [XLM](https://github
|
|||||||
```bash
|
```bash
|
||||||
python scripts/token_counts.py \
|
python scripts/token_counts.py \
|
||||||
--data_file data/binarized_text.bert-base-uncased.pickle \
|
--data_file data/binarized_text.bert-base-uncased.pickle \
|
||||||
--token_counts_dump data/token_counts.bert-base-uncased.pickle
|
--token_counts_dump data/token_counts.bert-base-uncased.pickle \
|
||||||
|
--vocab_size 30522
|
||||||
```
|
```
|
||||||
|
|
||||||
### B. Training
|
### B. Training
|
||||||
@@ -75,6 +84,12 @@ Training with distillation is really simple once you have pre-processed the data
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
python train.py \
|
python train.py \
|
||||||
|
--student_type distilbert \
|
||||||
|
--student_config training_configs/distilbert-base-uncased.json \
|
||||||
|
--teacher_type bert \
|
||||||
|
--teacher_name bert-base-uncased \
|
||||||
|
--alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --mlm \
|
||||||
|
--freeze_pos_embs \
|
||||||
--dump_path serialization_dir/my_first_training \
|
--dump_path serialization_dir/my_first_training \
|
||||||
--data_file data/binarized_text.bert-base-uncased.pickle \
|
--data_file data/binarized_text.bert-base-uncased.pickle \
|
||||||
--token_counts data/token_counts.bert-base-uncased.pickle \
|
--token_counts data/token_counts.bert-base-uncased.pickle \
|
||||||
@@ -83,7 +98,7 @@ python train.py \
|
|||||||
|
|
||||||
By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
|
By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
|
||||||
|
|
||||||
We highly encourage you to use distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
|
We highly encourage you to use distributed training for training DistilBERT as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export NODE_RANK=0
|
export NODE_RANK=0
|
||||||
@@ -105,11 +120,17 @@ python -m torch.distributed.launch \
|
|||||||
train.py \
|
train.py \
|
||||||
--force \
|
--force \
|
||||||
--n_gpu $WORLD_SIZE \
|
--n_gpu $WORLD_SIZE \
|
||||||
|
--student_type distilbert \
|
||||||
|
--student_config training_configs/distilbert-base-uncased.json \
|
||||||
|
--teacher_type bert \
|
||||||
|
--teacher_name bert-base-uncased \
|
||||||
|
--alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --mlm \
|
||||||
|
--freeze_pos_embs \
|
||||||
|
--dump_path serialization_dir/my_first_training \
|
||||||
--data_file data/binarized_text.bert-base-uncased.pickle \
|
--data_file data/binarized_text.bert-base-uncased.pickle \
|
||||||
--token_counts data/token_counts.bert-base-uncased.pickle \
|
--token_counts data/token_counts.bert-base-uncased.pickle
|
||||||
--dump_path serialization_dir/my_first_distillation
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!
|
**Tips:** Starting distillation training with a good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use the `--student_pretrained_weights` argument to use this initialization for the distilled training!
|
||||||
|
|
||||||
Happy distillation!
|
Happy distillation!
|
||||||
|
|||||||
@@ -12,8 +12,8 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" The distiller to distil DistilBERT
|
""" The distiller to distil the student.
|
||||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import math
|
import math
|
||||||
@@ -28,16 +28,19 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch.optim import AdamW
|
from torch.optim import AdamW
|
||||||
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
from torch.utils.data import RandomSampler, BatchSampler, DataLoader
|
||||||
|
|
||||||
from pytorch_transformers import WarmupLinearSchedule
|
from transformers import WarmupLinearSchedule
|
||||||
|
|
||||||
from utils import logger
|
from utils import logger
|
||||||
from dataset import Dataset
|
from lm_seqs_dataset import LmSeqsDataset
|
||||||
|
from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
|
||||||
|
|
||||||
class Distiller:
|
class Distiller:
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
params: dict,
|
params: dict,
|
||||||
dataloader: Dataset,
|
dataset: LmSeqsDataset,
|
||||||
token_probs: torch.tensor,
|
token_probs: torch.tensor,
|
||||||
student: nn.Module,
|
student: nn.Module,
|
||||||
teacher: nn.Module):
|
teacher: nn.Module):
|
||||||
@@ -50,33 +53,47 @@ class Distiller:
|
|||||||
self.student = student
|
self.student = student
|
||||||
self.teacher = teacher
|
self.teacher = teacher
|
||||||
|
|
||||||
self.dataloader = dataloader
|
self.student_config = student.config
|
||||||
if self.params.n_gpu > 1:
|
self.vocab_size = student.config.vocab_size
|
||||||
self.dataloader.split()
|
|
||||||
self.get_iterator(seed=params.seed)
|
if params.n_gpu <= 1:
|
||||||
|
sampler = RandomSampler(dataset)
|
||||||
|
else:
|
||||||
|
sampler = DistributedSampler(dataset)
|
||||||
|
|
||||||
|
if params.group_by_size:
|
||||||
|
groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
|
||||||
|
sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
|
||||||
|
else:
|
||||||
|
sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
|
||||||
|
|
||||||
|
self.dataloader = DataLoader(dataset=dataset,
|
||||||
|
batch_sampler=sampler,
|
||||||
|
collate_fn=dataset.batch_sequences)
|
||||||
|
|
||||||
self.temperature = params.temperature
|
self.temperature = params.temperature
|
||||||
assert self.temperature > 0.
|
assert self.temperature > 0.
|
||||||
|
|
||||||
self.alpha_ce = params.alpha_ce
|
self.alpha_ce = params.alpha_ce
|
||||||
self.alpha_mlm = params.alpha_mlm
|
self.alpha_mlm = params.alpha_mlm
|
||||||
|
self.alpha_clm = params.alpha_clm
|
||||||
self.alpha_mse = params.alpha_mse
|
self.alpha_mse = params.alpha_mse
|
||||||
self.alpha_cos = params.alpha_cos
|
self.alpha_cos = params.alpha_cos
|
||||||
assert self.alpha_ce >= 0.
|
|
||||||
assert self.alpha_mlm >= 0.
|
|
||||||
assert self.alpha_mse >= 0.
|
|
||||||
assert self.alpha_cos >= 0.
|
|
||||||
assert self.alpha_ce + self.alpha_mlm + self.alpha_mse + self.alpha_cos > 0.
|
|
||||||
|
|
||||||
self.mlm_mask_prop = params.mlm_mask_prop
|
self.mlm = params.mlm
|
||||||
assert 0.0 <= self.mlm_mask_prop <= 1.0
|
if self.mlm:
|
||||||
assert params.word_mask + params.word_keep + params.word_rand == 1.0
|
logger.info(f'Using MLM loss for LM step.')
|
||||||
self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
|
self.mlm_mask_prop = params.mlm_mask_prop
|
||||||
self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
|
assert 0.0 <= self.mlm_mask_prop <= 1.0
|
||||||
self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
|
assert params.word_mask + params.word_keep + params.word_rand == 1.0
|
||||||
if self.fp16:
|
self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
|
||||||
self.pred_probs = self.pred_probs.half()
|
self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
|
||||||
self.token_probs = self.token_probs.half()
|
self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
|
||||||
|
if self.fp16:
|
||||||
|
self.pred_probs = self.pred_probs.half()
|
||||||
|
self.token_probs = self.token_probs.half()
|
||||||
|
else:
|
||||||
|
logger.info(f'Using CLM loss for LM step.')
|
||||||
|
|
||||||
self.epoch = 0
|
self.epoch = 0
|
||||||
self.n_iter = 0
|
self.n_iter = 0
|
||||||
@@ -86,12 +103,13 @@ class Distiller:
|
|||||||
self.last_loss = 0
|
self.last_loss = 0
|
||||||
self.last_loss_ce = 0
|
self.last_loss_ce = 0
|
||||||
self.last_loss_mlm = 0
|
self.last_loss_mlm = 0
|
||||||
|
self.last_loss_clm = 0
|
||||||
if self.alpha_mse > 0.: self.last_loss_mse = 0
|
if self.alpha_mse > 0.: self.last_loss_mse = 0
|
||||||
if self.alpha_cos > 0.: self.last_loss_cos = 0
|
if self.alpha_cos > 0.: self.last_loss_cos = 0
|
||||||
self.last_log = 0
|
self.last_log = 0
|
||||||
|
|
||||||
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
|
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
|
||||||
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
||||||
if self.alpha_mse > 0.:
|
if self.alpha_mse > 0.:
|
||||||
self.mse_loss_fct = nn.MSELoss(reduction='sum')
|
self.mse_loss_fct = nn.MSELoss(reduction='sum')
|
||||||
if self.alpha_cos > 0.:
|
if self.alpha_cos > 0.:
|
||||||
@@ -99,7 +117,7 @@ class Distiller:
|
|||||||
|
|
||||||
logger.info('--- Initializing model optimizer')
|
logger.info('--- Initializing model optimizer')
|
||||||
assert params.gradient_accumulation_steps >= 1
|
assert params.gradient_accumulation_steps >= 1
|
||||||
self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
|
self.num_steps_epoch = len(self.dataloader)
|
||||||
num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
|
num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
|
||||||
|
|
||||||
no_decay = ['bias', 'LayerNorm.weight']
|
no_decay = ['bias', 'LayerNorm.weight']
|
||||||
@@ -140,43 +158,18 @@ class Distiller:
|
|||||||
logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
|
logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
|
||||||
self.student = DistributedDataParallel(self.student,
|
self.student = DistributedDataParallel(self.student,
|
||||||
device_ids=[params.local_rank],
|
device_ids=[params.local_rank],
|
||||||
output_device=params.local_rank)
|
output_device=params.local_rank,
|
||||||
|
find_unused_parameters=True)
|
||||||
|
|
||||||
self.is_master = params.is_master
|
self.is_master = params.is_master
|
||||||
if self.is_master:
|
if self.is_master:
|
||||||
logger.info('--- Initializing Tensorboard')
|
logger.info('--- Initializing Tensorboard')
|
||||||
self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
|
self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
|
||||||
self.tensorboard.add_text(tag='config', text_string=str(self.params), global_step=0)
|
self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
|
||||||
|
self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)
|
||||||
|
|
||||||
def get_iterator(self,
|
def prepare_batch_mlm(self,
|
||||||
seed: int = None):
|
batch):
|
||||||
"""
|
|
||||||
Initialize the data iterator.
|
|
||||||
Each process has its own data iterator (iterating on his own random portion of the dataset).
|
|
||||||
|
|
||||||
Input:
|
|
||||||
------
|
|
||||||
seed: `int` - The random seed.
|
|
||||||
"""
|
|
||||||
logger.info('--- Initializing Data Iterator')
|
|
||||||
self.data_iterator = self.dataloader.get_iterator(seed=seed)
|
|
||||||
|
|
||||||
def get_batch(self):
|
|
||||||
"""
|
|
||||||
Call the data iterator to output a new batch.
|
|
||||||
If the data iterator went through the whole dataset, create a new iterator.
|
|
||||||
"""
|
|
||||||
assert hasattr(self, 'data_iterator')
|
|
||||||
try:
|
|
||||||
x = next(self.data_iterator)
|
|
||||||
except StopIteration:
|
|
||||||
logger.warning('--- Went through the whole dataset. Creating new data iterator.')
|
|
||||||
self.data_iterator = self.dataloader.get_iterator()
|
|
||||||
x = next(self.data_iterator)
|
|
||||||
return x
|
|
||||||
|
|
||||||
def prepare_batch(self,
|
|
||||||
batch):
|
|
||||||
"""
|
"""
|
||||||
Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
|
Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
|
||||||
|
|
||||||
@@ -222,7 +215,7 @@ class Distiller:
|
|||||||
assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
|
assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
|
||||||
|
|
||||||
_token_ids_real = token_ids[pred_mask]
|
_token_ids_real = token_ids[pred_mask]
|
||||||
_token_ids_rand = _token_ids_real.clone().random_(self.params.vocab_size)
|
_token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
|
||||||
_token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
|
_token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
|
||||||
probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
|
probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
|
||||||
_token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
|
_token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
|
||||||
@@ -230,8 +223,41 @@ class Distiller:
|
|||||||
|
|
||||||
mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
|
mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||||
|
|
||||||
|
# sanity checks
|
||||||
|
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
|
||||||
|
|
||||||
return token_ids, attn_mask, mlm_labels
|
return token_ids, attn_mask, mlm_labels
|
||||||
|
|
||||||
|
def prepare_batch_clm(self,
|
||||||
|
batch):
|
||||||
|
"""
|
||||||
|
Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.
|
||||||
|
|
||||||
|
Input:
|
||||||
|
------
|
||||||
|
batch: `Tuple`
|
||||||
|
token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded.
|
||||||
|
lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
|
||||||
|
|
||||||
|
Output:
|
||||||
|
-------
|
||||||
|
token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
|
||||||
|
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
|
||||||
|
clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
|
||||||
|
"""
|
||||||
|
token_ids, lengths = batch
|
||||||
|
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
|
||||||
|
assert token_ids.size(0) == lengths.size(0)
|
||||||
|
|
||||||
|
attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
|
||||||
|
clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
|
||||||
|
clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||||
|
|
||||||
|
# sanity checks
|
||||||
|
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
|
||||||
|
|
||||||
|
return token_ids, attn_mask, clm_labels
|
||||||
|
|
||||||
def round_batch(self,
|
def round_batch(self,
|
||||||
x: torch.tensor,
|
x: torch.tensor,
|
||||||
lengths: torch.tensor):
|
lengths: torch.tensor):
|
||||||
@@ -269,7 +295,10 @@ class Distiller:
|
|||||||
if ml1 % 8 != 0:
|
if ml1 % 8 != 0:
|
||||||
pad = 8 - (ml1 % 8)
|
pad = 8 - (ml1 % 8)
|
||||||
ml2 = ml1 + pad
|
ml2 = ml1 + pad
|
||||||
pad_id = self.params.special_tok_ids['pad_token']
|
if self.mlm:
|
||||||
|
pad_id = self.params.special_tok_ids['pad_token']
|
||||||
|
else:
|
||||||
|
pad_id = self.params.special_tok_ids['unk_token']
|
||||||
padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
|
padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
|
||||||
x = torch.cat([x, padding_tensor], 1)
|
x = torch.cat([x, padding_tensor], 1)
|
||||||
assert x.size() == (bs2, ml2)
|
assert x.size() == (bs2, ml2)
|
||||||
@@ -292,14 +321,16 @@ class Distiller:
|
|||||||
if self.multi_gpu:
|
if self.multi_gpu:
|
||||||
torch.distributed.barrier()
|
torch.distributed.barrier()
|
||||||
|
|
||||||
iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
|
iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
|
||||||
for __ in range(self.num_steps_epoch):
|
for batch in iter_bar:
|
||||||
batch = self.get_batch()
|
|
||||||
if self.params.n_gpu > 0:
|
if self.params.n_gpu > 0:
|
||||||
batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
|
batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
|
||||||
token_ids, attn_mask, mlm_labels = self.prepare_batch(batch=batch)
|
|
||||||
|
|
||||||
self.step(input_ids=token_ids, attention_mask=attn_mask, mlm_labels=mlm_labels)
|
if self.mlm:
|
||||||
|
token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
|
||||||
|
else:
|
||||||
|
token_ids, attn_mask, lm_labels = self.prepare_batch_clm(batch=batch)
|
||||||
|
self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)
|
||||||
|
|
||||||
iter_bar.update()
|
iter_bar.update()
|
||||||
iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
|
iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
|
||||||
@@ -317,7 +348,7 @@ class Distiller:
|
|||||||
def step(self,
|
def step(self,
|
||||||
input_ids: torch.tensor,
|
input_ids: torch.tensor,
|
||||||
attention_mask: torch.tensor,
|
attention_mask: torch.tensor,
|
||||||
mlm_labels: torch.tensor):
|
lm_labels: torch.tensor):
|
||||||
"""
|
"""
|
||||||
One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
|
One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
|
||||||
and possibly a parameter update (depending on the gradient accumulation).
|
and possibly a parameter update (depending on the gradient accumulation).
|
||||||
@@ -326,17 +357,22 @@ class Distiller:
|
|||||||
------
|
------
|
||||||
input_ids: `torch.tensor(bs, seq_length)` - The token ids.
|
input_ids: `torch.tensor(bs, seq_length)` - The token ids.
|
||||||
attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
|
attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
|
||||||
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
|
lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
|
||||||
"""
|
"""
|
||||||
s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
|
if self.mlm:
|
||||||
with torch.no_grad():
|
s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
|
||||||
t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
|
with torch.no_grad():
|
||||||
|
t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
|
||||||
|
else:
|
||||||
|
s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size)
|
||||||
|
with torch.no_grad():
|
||||||
|
t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size)
|
||||||
assert s_logits.size() == t_logits.size()
|
assert s_logits.size() == t_logits.size()
|
||||||
|
|
||||||
#https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
|
#https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
|
||||||
#https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
|
#https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
|
||||||
if self.params.restrict_ce_to_mask:
|
if self.params.restrict_ce_to_mask:
|
||||||
mask = (mlm_labels>-1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||||
else:
|
else:
|
||||||
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||||
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||||
@@ -348,13 +384,20 @@ class Distiller:
|
|||||||
loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
|
loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
|
||||||
F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
|
F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
|
||||||
loss = self.alpha_ce*loss_ce
|
loss = self.alpha_ce*loss_ce
|
||||||
|
|
||||||
if self.alpha_mlm > 0.:
|
if self.alpha_mlm > 0.:
|
||||||
loss_mlm = self.mlm_loss_fct(s_logits.view(-1, s_logits.size(-1)), mlm_labels.view(-1))
|
loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
|
||||||
loss += self.alpha_mlm * loss_mlm
|
loss += self.alpha_mlm * loss_mlm
|
||||||
|
if self.alpha_clm > 0.:
|
||||||
|
shift_logits = s_logits[..., :-1, :].contiguous()
|
||||||
|
shift_labels = lm_labels[..., 1:].contiguous()
|
||||||
|
loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
|
shift_labels.view(-1))
|
||||||
|
loss += self.alpha_clm * loss_clm
|
||||||
|
|
||||||
if self.alpha_mse > 0.:
|
if self.alpha_mse > 0.:
|
||||||
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
|
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
|
||||||
loss += self.alpha_mse * loss_mse
|
loss += self.alpha_mse * loss_mse
|
||||||
|
|
||||||
if self.alpha_cos > 0.:
|
if self.alpha_cos > 0.:
|
||||||
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
|
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
|
||||||
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
|
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
|
||||||
@@ -376,6 +419,8 @@ class Distiller:
|
|||||||
self.last_loss_ce = loss_ce.item()
|
self.last_loss_ce = loss_ce.item()
|
||||||
if self.alpha_mlm > 0.:
|
if self.alpha_mlm > 0.:
|
||||||
self.last_loss_mlm = loss_mlm.item()
|
self.last_loss_mlm = loss_mlm.item()
|
||||||
|
if self.alpha_clm > 0.:
|
||||||
|
self.last_loss_clm = loss_clm.item()
|
||||||
if self.alpha_mse > 0.:
|
if self.alpha_mse > 0.:
|
||||||
self.last_loss_mse = loss_mse.item()
|
self.last_loss_mse = loss_mse.item()
|
||||||
if self.alpha_cos > 0.:
|
if self.alpha_cos > 0.:
|
||||||
@@ -452,6 +497,8 @@ class Distiller:
|
|||||||
self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
|
||||||
if self.alpha_mlm > 0.:
|
if self.alpha_mlm > 0.:
|
||||||
self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
|
||||||
|
if self.alpha_clm > 0.:
|
||||||
|
self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
|
||||||
if self.alpha_mse > 0.:
|
if self.alpha_mse > 0.:
|
||||||
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
|
||||||
if self.alpha_cos > 0.:
|
if self.alpha_cos > 0.:
|
||||||
|
|||||||
105
examples/distillation/grouped_batch_sampler.py
Normal file
105
examples/distillation/grouped_batch_sampler.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py)
|
||||||
|
"""
|
||||||
|
import bisect
|
||||||
|
import copy
|
||||||
|
from collections import defaultdict
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from torch.utils.data.sampler import BatchSampler, Sampler
|
||||||
|
|
||||||
|
from utils import logger
|
||||||
|
|
||||||
|
def _quantize(x, bins):
|
||||||
|
bins = copy.deepcopy(bins)
|
||||||
|
bins = sorted(bins)
|
||||||
|
quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
|
||||||
|
return quantized
|
||||||
|
|
||||||
|
def create_lengths_groups(lengths, k=0):
    """Assign every sequence length to a length group and log the group statistics.

    Arguments:
        lengths (iterable[int]): Length of each sample.
        k (int): Exclusive upper bound for the bin edges. When `k > 0` the edges are
            3, 7, 11, ... (step of 4) strictly below `k`; otherwise a single edge at 10 is used.

    Returns:
        list[int]: The group id of each sample, in the same order as `lengths`.
        Ids lie in `[0, number_of_edges]`.
    """
    bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
    groups = _quantize(lengths, bins)
    # Count the number of elements per group. `bincount` with `minlength` keeps one slot
    # per bin (empty bins are reported as 0), so the logged counts line up with the
    # logged bin edges; the explicit dtype also makes an empty `groups` list valid.
    counts = np.bincount(np.asarray(groups, dtype=np.int64), minlength=len(bins) + 1)
    fbins = [0] + bins + [np.inf]
    logger.info("Using {} as bins for aspect lengths quantization".format(fbins))
    logger.info("Count of instances per bin: {}".format(counts))
    return groups
|
||||||
|
|
||||||
|
class GroupedBatchSampler(BatchSampler):
    """
    Wraps another sampler to yield a mini-batch of indices.
    It enforces that the batch only contain elements from the same group.
    It also tries to provide mini-batches which follows an ordering which is
    as close as possible to the ordering from the original sampler.
    Once the sampler is exhausted, the indices left in incomplete per-group buffers
    are merged (in increasing group id order) into the final batches, so those
    last batches may mix neighbouring groups.

    Arguments:
        sampler (Sampler): Base sampler.
        group_ids (list[int]): If the sampler produces indices in range [0, N),
            `group_ids` must be a list of `N` ints which contains the group id of each sample.
            The group ids must be a continuous set of integers starting from
            0, i.e. they must be in the range [0, num_groups).
        batch_size (int): Size of mini-batch.

    Raises:
        ValueError: If `sampler` is not a `torch.utils.data.Sampler`.
    """

    def __init__(self, sampler, group_ids, batch_size):
        if not isinstance(sampler, Sampler):
            raise ValueError(
                "sampler should be an instance of "
                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
            )
        self.sampler = sampler
        self.group_ids = group_ids
        self.batch_size = batch_size

    def __iter__(self):
        """
        Yield lists of indices: first the full single-group batches, in the order in
        which they fill up, then the batches built from the leftover indices.
        """
        # Indices seen so far for each group that have not been emitted yet.
        buffer_per_group = defaultdict(list)

        num_batches = 0
        for idx in self.sampler:
            group_id = self.group_ids[idx]
            bucket = buffer_per_group[group_id]
            bucket.append(idx)
            if len(bucket) == self.batch_size:
                yield bucket
                num_batches += 1
                # Forget the emitted list; the next index of this group starts a new one.
                del buffer_per_group[group_id]
            else:
                assert len(bucket) < self.batch_size

        # now we have run out of elements that satisfy
        # the group criteria, let's return the remaining
        # elements so that the size of the sampler is
        # deterministic
        expected_num_batches = len(self)
        num_remaining = expected_num_batches - num_batches
        if num_remaining > 0:
            # for the remaining batches, group the batches by similar lengths:
            # walking the buffers in group id order keeps neighbouring groups together
            batch_idx = []
            for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
                batch_idx.extend(idxs)
                # Every buffer holds fewer than `batch_size` indices, so at most one
                # full batch can be cut off after each extension.
                if len(batch_idx) >= self.batch_size:
                    yield batch_idx[:self.batch_size]
                    batch_idx = batch_idx[self.batch_size:]
                    num_remaining -= 1
            if len(batch_idx) > 0:
                yield batch_idx
                num_remaining -= 1
        assert num_remaining == 0

    def __len__(self):
        """
        Return the number of mini-batches rather than the number of samples.
        """
        return (len(self.sampler) + self.batch_size - 1) // self.batch_size
|
||||||
@@ -12,30 +12,33 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" Dataloaders to train DistilBERT
|
""" Dataset to train distilled models
|
||||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||||
"""
|
"""
|
||||||
from typing import List
|
|
||||||
import math
|
|
||||||
from itertools import chain
|
|
||||||
from collections import Counter
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
import torch
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from utils import logger
|
from utils import logger
|
||||||
|
|
||||||
class Dataset:
|
class LmSeqsDataset(Dataset):
|
||||||
|
"""Custom Dataset wrapping language modeling sequences.
|
||||||
|
|
||||||
|
Each sample will be retrieved by indexing the list of token_ids and their corresponding lengths.
|
||||||
|
|
||||||
|
Input:
|
||||||
|
------
|
||||||
|
params: `NameSpace` parameters
|
||||||
|
data: `List[np.array[int]]
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
params,
|
params,
|
||||||
data):
|
data):
|
||||||
self.params = params
|
self.params = params
|
||||||
self.tokens_per_batch = params.tokens_per_batch
|
|
||||||
self.batch_size = params.batch_size
|
|
||||||
self.shuffle = params.shuffle
|
|
||||||
self.group_by_size = params.group_by_size
|
|
||||||
|
|
||||||
self.token_ids = np.array(data)
|
self.token_ids = np.array(data)
|
||||||
self.lengths = np.uint16([len(t) for t in data])
|
self.lengths = np.array([len(t) for t in data])
|
||||||
|
|
||||||
self.check()
|
self.check()
|
||||||
self.remove_long_sequences()
|
self.remove_long_sequences()
|
||||||
@@ -43,6 +46,9 @@ class Dataset:
|
|||||||
self.check()
|
self.check()
|
||||||
self.print_statistics()
|
self.print_statistics()
|
||||||
|
|
||||||
|
def __getitem__(self, index):
|
||||||
|
return (self.token_ids[index], self.lengths[index])
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.lengths)
|
return len(self.lengths)
|
||||||
|
|
||||||
@@ -51,12 +57,14 @@ class Dataset:
|
|||||||
Some sanity checks
|
Some sanity checks
|
||||||
"""
|
"""
|
||||||
assert len(self.token_ids) == len(self.lengths)
|
assert len(self.token_ids) == len(self.lengths)
|
||||||
|
assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
|
||||||
|
|
||||||
def remove_long_sequences(self):
|
def remove_long_sequences(self):
|
||||||
"""
|
"""
|
||||||
Sequences that are too long are splitted by chunk of max_position_embeddings.
|
Sequences that are too long are splitted by chunk of max_model_input_size.
|
||||||
"""
|
"""
|
||||||
indices = self.lengths >= self.params.max_position_embeddings
|
max_len = self.params.max_model_input_size
|
||||||
|
indices = self.lengths > max_len
|
||||||
logger.info(f'Splitting {sum(indices)} too long sequences.')
|
logger.info(f'Splitting {sum(indices)} too long sequences.')
|
||||||
|
|
||||||
def divide_chunks(l, n):
|
def divide_chunks(l, n):
|
||||||
@@ -64,10 +72,13 @@ class Dataset:
|
|||||||
|
|
||||||
new_tok_ids = []
|
new_tok_ids = []
|
||||||
new_lengths = []
|
new_lengths = []
|
||||||
cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
|
if self.params.mlm:
|
||||||
max_len = self.params.max_position_embeddings
|
cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
|
||||||
|
else:
|
||||||
|
cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
|
||||||
|
|
||||||
for seq_, len_ in zip(self.token_ids, self.lengths):
|
for seq_, len_ in zip(self.token_ids, self.lengths):
|
||||||
|
assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
|
||||||
if len_ <= max_len:
|
if len_ <= max_len:
|
||||||
new_tok_ids.append(seq_)
|
new_tok_ids.append(seq_)
|
||||||
new_lengths.append(len_)
|
new_lengths.append(len_)
|
||||||
@@ -79,6 +90,7 @@ class Dataset:
|
|||||||
if sub_s[-1] != sep_id:
|
if sub_s[-1] != sep_id:
|
||||||
sub_s = np.insert(sub_s, len(sub_s), sep_id)
|
sub_s = np.insert(sub_s, len(sub_s), sep_id)
|
||||||
assert len(sub_s) <= max_len
|
assert len(sub_s) <= max_len
|
||||||
|
assert (sub_s[0] == cls_id) and (sub_s[-1] == sep_id), sub_s
|
||||||
sub_seqs.append(sub_s)
|
sub_seqs.append(sub_s)
|
||||||
|
|
||||||
new_tok_ids.extend(sub_seqs)
|
new_tok_ids.extend(sub_seqs)
|
||||||
@@ -113,89 +125,27 @@ class Dataset:
|
|||||||
# nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
|
# nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
|
||||||
# logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
|
# logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
|
||||||
|
|
||||||
def select_data(self, a: int, b: int):
|
|
||||||
"""
|
|
||||||
Select a subportion of the data.
|
|
||||||
"""
|
|
||||||
n_sequences = len(self)
|
|
||||||
assert 0 <= a < b <= n_sequences, ValueError(f'`0 <= a < b <= n_sequences` is not met with a={a} and b={b}')
|
|
||||||
|
|
||||||
logger.info(f'Selecting sequences from {a} to {b} (excluded).')
|
|
||||||
self.token_ids = self.token_ids[a:b]
|
|
||||||
self.lengths = self.lengths[a:b]
|
|
||||||
|
|
||||||
self.check()
|
|
||||||
|
|
||||||
def split(self):
|
|
||||||
"""
|
|
||||||
Distributed training: split the data accross the processes.
|
|
||||||
"""
|
|
||||||
assert self.params.n_gpu > 1
|
|
||||||
logger.info('Splitting the data accross the processuses.')
|
|
||||||
n_seq = len(self)
|
|
||||||
n_seq_per_procesus = n_seq // self.params.world_size
|
|
||||||
a = n_seq_per_procesus * self.params.global_rank
|
|
||||||
b = a + n_seq_per_procesus
|
|
||||||
self.select_data(a=a, b=b)
|
|
||||||
|
|
||||||
def batch_sequences(self,
|
def batch_sequences(self,
|
||||||
token_ids: List[List[int]],
|
batch):
|
||||||
lengths: List[int]):
|
|
||||||
"""
|
"""
|
||||||
Do the padding and transform into torch.tensor.
|
Do the padding and transform into torch.tensor.
|
||||||
"""
|
"""
|
||||||
|
token_ids = [t[0] for t in batch]
|
||||||
|
lengths = [t[1] for t in batch]
|
||||||
assert len(token_ids) == len(lengths)
|
assert len(token_ids) == len(lengths)
|
||||||
|
|
||||||
# Max for paddings
|
# Max for paddings
|
||||||
max_seq_len_ = max(lengths)
|
max_seq_len_ = max(lengths)
|
||||||
|
|
||||||
# Pad token ids
|
# Pad token ids
|
||||||
pad_idx = self.params.special_tok_ids['pad_token']
|
if self.params.mlm:
|
||||||
|
pad_idx = self.params.special_tok_ids['pad_token']
|
||||||
|
else:
|
||||||
|
pad_idx = self.params.special_tok_ids['unk_token']
|
||||||
tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
|
tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
|
||||||
assert len(tk_) == len(token_ids)
|
assert len(tk_) == len(token_ids)
|
||||||
assert all(len(t) == max_seq_len_ for t in tk_)
|
assert all(len(t) == max_seq_len_ for t in tk_)
|
||||||
|
|
||||||
tk_t = torch.tensor(tk_) # (bs, max_seq_len_)
|
tk_t = torch.tensor(tk_) # (bs, max_seq_len_)
|
||||||
lg_t = torch.tensor(lengths.astype(int)) # (bs)
|
lg_t = torch.tensor(lengths) # (bs)
|
||||||
return tk_t, lg_t
|
return tk_t, lg_t
|
||||||
|
|
||||||
def get_batches_iterator(self,
|
|
||||||
batches):
|
|
||||||
"""
|
|
||||||
Return an iterator over batches.
|
|
||||||
"""
|
|
||||||
for sequences_ids in batches:
|
|
||||||
token_ids, lengths = self.batch_sequences(self.token_ids[sequences_ids],
|
|
||||||
self.lengths[sequences_ids])
|
|
||||||
yield (token_ids, lengths)
|
|
||||||
|
|
||||||
def get_iterator(self,
|
|
||||||
seed: int = None):
|
|
||||||
"""
|
|
||||||
Return a data iterator.
|
|
||||||
"""
|
|
||||||
rng = np.random.RandomState(seed)
|
|
||||||
|
|
||||||
n_sequences = len(self)
|
|
||||||
indices = np.arange(n_sequences)
|
|
||||||
|
|
||||||
if self.group_by_size:
|
|
||||||
indices = indices[np.argsort(self.lengths[indices], kind='mergesort')]
|
|
||||||
|
|
||||||
if self.tokens_per_batch == -1:
|
|
||||||
batches = np.array_split(indices, math.ceil(len(indices) * 1. / self.batch_size))
|
|
||||||
else:
|
|
||||||
assert self.tokens_per_batch > 0
|
|
||||||
batch_ids = np.cumsum(self.lengths[indices]) // self.tokens_per_batch
|
|
||||||
_, bounds = np.unique(batch_ids, return_index=True)
|
|
||||||
batches = [indices[bounds[i]:bounds[i + 1]] for i in range(len(bounds) - 1)]
|
|
||||||
if bounds[-1] < len(indices):
|
|
||||||
batches.append(indices[bounds[-1]:])
|
|
||||||
|
|
||||||
if self.shuffle:
|
|
||||||
rng.shuffle(batches)
|
|
||||||
|
|
||||||
assert n_sequences == sum([len(x) for x in batches])
|
|
||||||
assert self.lengths[indices].sum() == sum([self.lengths[x].sum() for x in batches])
|
|
||||||
|
|
||||||
return self.get_batches_iterator(batches=batches)
|
|
||||||
@@ -3,4 +3,4 @@ tensorboard>=1.14.0
|
|||||||
tensorboardX==1.8
|
tensorboardX==1.8
|
||||||
psutil==5.6.3
|
psutil==5.6.3
|
||||||
scipy==1.3.1
|
scipy==1.3.1
|
||||||
pytorch_transformers==1.2.0
|
transformers==2.0.0
|
||||||
|
|||||||
@@ -13,14 +13,14 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
Preprocessing script before training DistilBERT.
|
Preprocessing script before distillation.
|
||||||
"""
|
"""
|
||||||
import argparse
|
import argparse
|
||||||
import pickle
|
import pickle
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pytorch_transformers import BertTokenizer, RobertaTokenizer
|
from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
@@ -32,7 +32,7 @@ def main():
|
|||||||
parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
|
parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
|
||||||
parser.add_argument('--file_path', type=str, default='data/dump.txt',
|
parser.add_argument('--file_path', type=str, default='data/dump.txt',
|
||||||
help='The path to the data.')
|
help='The path to the data.')
|
||||||
parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta'])
|
parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
|
||||||
parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
|
parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
|
||||||
help="The tokenizer to use.")
|
help="The tokenizer to use.")
|
||||||
parser.add_argument('--dump_file', type=str, default='data/dump',
|
parser.add_argument('--dump_file', type=str, default='data/dump',
|
||||||
@@ -43,10 +43,16 @@ def main():
|
|||||||
logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
|
logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
|
||||||
if args.tokenizer_type == 'bert':
|
if args.tokenizer_type == 'bert':
|
||||||
tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
|
tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
|
||||||
|
bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
|
||||||
|
sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
|
||||||
elif args.tokenizer_type == 'roberta':
|
elif args.tokenizer_type == 'roberta':
|
||||||
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
|
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
|
||||||
bos = tokenizer.special_tokens_map['bos_token'] # `[CLS]` for bert, `<s>` for roberta
|
bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
|
||||||
sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]` for bert, `</s>` for roberta
|
sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
|
||||||
|
elif args.tokenizer_type == 'gpt2':
|
||||||
|
tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
|
||||||
|
bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
|
||||||
|
sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`
|
||||||
|
|
||||||
logger.info(f'Loading text from {args.file_path}')
|
logger.info(f'Loading text from {args.file_path}')
|
||||||
with open(args.file_path, 'r', encoding='utf8') as fp:
|
with open(args.file_path, 'r', encoding='utf8') as fp:
|
||||||
|
|||||||
89
examples/distillation/scripts/extract.py
Normal file
89
examples/distillation/scripts/extract.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Preprocessing script before training the distilled model.
|
||||||
|
Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
|
||||||
|
"""
|
||||||
|
from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
|
||||||
|
import torch
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command line: which teacher to load and where to dump the extracted weights.
    parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
    parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
    parser.add_argument("--model_name", default='roberta-large', type=str)
    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
    parser.add_argument("--vocab_transform", action='store_true')
    args = parser.parse_args()

    # `prefix` is the name of the base-model sub-module inside the teacher's state dict.
    if args.model_type == 'roberta':
        model = RobertaForMaskedLM.from_pretrained(args.model_name)
        prefix = 'roberta'
    elif args.model_type == 'gpt2':
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
        prefix = 'transformer'

    state_dict = model.state_dict()
    # Student state dict: a subset of the teacher's tensors, with the kept layers renumbered.
    compressed_sd = {}

    ### Embeddings ###
    if args.model_type == 'gpt2':
        for param_name in ['wte.weight', 'wpe.weight']:
            compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
    else:
        for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
            param_name = f'{prefix}.embeddings.{w}.weight'
            compressed_sd[param_name] = state_dict[param_name]
        for w in ['weight', 'bias']:
            param_name = f'{prefix}.embeddings.LayerNorm.{w}'
            compressed_sd[param_name] = state_dict[param_name]

    ### Transformer Blocks ###
    # Teacher layer `teacher_idx` is copied into student layer `std_idx` (its rank in the list).
    teacher_layers = [0, 2, 4, 7, 9, 11]
    for std_idx, teacher_idx in enumerate(teacher_layers):
        if args.model_type == 'gpt2':
            for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
                for w in ['weight', 'bias']:
                    compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
                        state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
            # `attn.bias` is copied as well so the student checkpoint mirrors the teacher's keys.
            compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
        else:
            for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
                          'attention.output.dense', 'attention.output.LayerNorm',
                          'intermediate.dense', 'output.dense', 'output.LayerNorm']:
                for w in ['weight', 'bias']:
                    compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
                        state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']

    ### Language Modeling Head ###
    if args.model_type == 'roberta':
        for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
            compressed_sd[layer] = state_dict[layer]
        # The dense + layer-norm transform of the head is only transferred on request.
        if args.vocab_transform:
            for w in ['weight', 'bias']:
                compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
                compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
    elif args.model_type == 'gpt2':
        for w in ['weight', 'bias']:
            compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
        compressed_sd['lm_head.weight'] = state_dict['lm_head.weight']

    print(f'N layers selected for distillation: {len(teacher_layers)}')
    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')

    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
    torch.save(compressed_sd, args.dump_checkpoint)
|
||||||
@@ -14,14 +14,15 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
Preprocessing script before training DistilBERT.
|
Preprocessing script before training DistilBERT.
|
||||||
|
Specific to BERT -> DistilBERT.
|
||||||
"""
|
"""
|
||||||
from pytorch_transformers import BertForMaskedLM, RobertaForMaskedLM
|
from transformers import BertForMaskedLM, RobertaForMaskedLM
|
||||||
import torch
|
import torch
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
|
parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
|
||||||
parser.add_argument("--model_type", default="bert", choices=["bert", "roberta"])
|
parser.add_argument("--model_type", default="bert", choices=["bert"])
|
||||||
parser.add_argument("--model_name", default='bert-base-uncased', type=str)
|
parser.add_argument("--model_name", default='bert-base-uncased', type=str)
|
||||||
parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
|
parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
|
||||||
parser.add_argument("--vocab_transform", action='store_true')
|
parser.add_argument("--vocab_transform", action='store_true')
|
||||||
@@ -31,9 +32,8 @@ if __name__ == '__main__':
|
|||||||
if args.model_type == 'bert':
|
if args.model_type == 'bert':
|
||||||
model = BertForMaskedLM.from_pretrained(args.model_name)
|
model = BertForMaskedLM.from_pretrained(args.model_name)
|
||||||
prefix = 'bert'
|
prefix = 'bert'
|
||||||
elif args.model_type == 'roberta':
|
else:
|
||||||
model = RobertaForMaskedLM.from_pretrained(args.model_name)
|
raise ValueError(f'args.model_type should be "bert".')
|
||||||
prefix = 'roberta'
|
|
||||||
|
|
||||||
state_dict = model.state_dict()
|
state_dict = model.state_dict()
|
||||||
compressed_sd = {}
|
compressed_sd = {}
|
||||||
@@ -68,20 +68,12 @@ if __name__ == '__main__':
|
|||||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
|
||||||
std_idx += 1
|
std_idx += 1
|
||||||
|
|
||||||
if args.model_type == 'bert':
|
compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
|
||||||
compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
|
compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
|
||||||
compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
|
if args.vocab_transform:
|
||||||
if args.vocab_transform:
|
for w in ['weight', 'bias']:
|
||||||
for w in ['weight', 'bias']:
|
compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
|
||||||
compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
|
compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
|
||||||
compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
|
|
||||||
elif args.model_type == 'roberta':
|
|
||||||
compressed_sd[f'vocab_projector.weight'] = state_dict[f'lm_head.decoder.weight']
|
|
||||||
compressed_sd[f'vocab_projector.bias'] = state_dict[f'lm_head.bias']
|
|
||||||
if args.vocab_transform:
|
|
||||||
for w in ['weight', 'bias']:
|
|
||||||
compressed_sd[f'vocab_transform.{w}'] = state_dict[f'lm_head.dense.{w}']
|
|
||||||
compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
|
|
||||||
|
|
||||||
print(f'N layers selected for distillation: {std_idx}')
|
print(f'N layers selected for distillation: {std_idx}')
|
||||||
print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
|
print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
|
||||||
@@ -13,7 +13,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
Preprocessing script before training DistilBERT.
|
Preprocessing script before training the distilled model.
|
||||||
"""
|
"""
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import argparse
|
import argparse
|
||||||
|
|||||||
@@ -13,7 +13,8 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
Training DistilBERT.
|
Training the distilled model.
|
||||||
|
Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2.
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
@@ -23,68 +24,96 @@ import shutil
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM
|
from transformers import BertConfig, BertForMaskedLM, BertTokenizer
|
||||||
from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig
|
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
|
||||||
|
from transformers import DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer
|
||||||
|
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
|
||||||
|
|
||||||
from distiller import Distiller
|
from distiller import Distiller
|
||||||
from utils import git_log, logger, init_gpu_params, set_seed
|
from utils import git_log, logger, init_gpu_params, set_seed
|
||||||
from dataset import Dataset
|
from lm_seqs_dataset import LmSeqsDataset
|
||||||
|
|
||||||
|
|
||||||
|
# Maps a model-type name (the value given to `--student_type` / `--teacher_type`)
# to its (config class, language-modeling model class, tokenizer class) triple.
MODEL_CLASSES = {
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
}
|
||||||
|
|
||||||
|
def sanity_checks(args):
    """
    Validate the parsed command-line arguments before any training work starts.

    The checks cover: consistency between the `mlm` flag, the MLM/CLM loss weights
    and the student/teacher types; existence of the files the run depends on;
    compatibility of the embedding-freezing flags with the student type; and
    non-negativity of every loss weight (with at least one strictly positive).

    Args:
        args: namespace of parsed arguments (as returned by `parser.parse_args()`).

    Raises:
        AssertionError: if any check fails.
    """
    # `mlm` and a strictly positive `alpha_mlm` must be used together.
    assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.), \
        "`mlm` requires `alpha_mlm` > 0, and `alpha_mlm` must be 0 when `mlm` is not set."
    # Exactly one of the two language-modeling losses (MLM or CLM) is active.
    assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.), \
        "Exactly one of `alpha_mlm` and `alpha_clm` must be strictly positive."
    if args.mlm:
        # `token_counts` has no default: reject a missing value explicitly instead of
        # letting `os.path.isfile(None)` fail with a TypeError.
        assert args.token_counts is not None and os.path.isfile(args.token_counts), \
            "`token_counts` must point to an existing file when `mlm` is set."
        assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert']), \
            "With `mlm`, student_type must be roberta/distilbert and teacher_type must be roberta/bert."
    else:
        assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2']), \
            "Without `mlm` (CLM), both student_type and teacher_type must be gpt2."

    # Student and teacher share the same type, except for the BERT -> DistilBERT pair.
    assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert'), \
        "teacher_type must equal student_type (or be bert for a distilbert student)."
    assert os.path.isfile(args.student_config), \
        "`student_config` must point to an existing file."
    if args.student_pretrained_weights is not None:
        assert os.path.isfile(args.student_pretrained_weights), \
            "`student_pretrained_weights` must point to an existing file."

    # The freezing helpers only act on these student types; refuse other combinations
    # rather than silently ignoring the flag.
    if args.freeze_pos_embs:
        assert args.student_type in ['roberta', 'gpt2'], \
            "`freeze_pos_embs` is only supported for student_type in ['roberta', 'gpt2']."
    if args.freeze_token_type_embds:
        assert args.student_type in ['roberta'], \
            "`freeze_token_type_embds` is only supported for student_type in ['roberta']."

    assert args.alpha_ce >= 0., "`alpha_ce` must be >= 0."
    assert args.alpha_mlm >= 0., "`alpha_mlm` must be >= 0."
    assert args.alpha_clm >= 0., "`alpha_clm` must be >= 0."
    assert args.alpha_mse >= 0., "`alpha_mse` must be >= 0."
    assert args.alpha_cos >= 0., "`alpha_cos` must be >= 0."
    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0., \
        "At least one loss weight must be strictly positive."
|
||||||
|
|
||||||
|
def freeze_pos_embeddings(student, args):
    """
    Stop gradient updates on the student's position-embedding weights.

    Only `args.student_type` values 'roberta' and 'gpt2' are handled; for any
    other student type the function leaves the model untouched.

    Args:
        student: the student model, modified in place.
        args: parsed arguments; only `args.student_type` is read.
    """
    if args.student_type == 'roberta':
        student.roberta.embeddings.position_embeddings.weight.requires_grad = False
    elif args.student_type == 'gpt2':
        student.transformer.wpe.weight.requires_grad = False
|
||||||
|
|
||||||
|
def freeze_token_type_embeddings(student, args):
    """
    Stop gradient updates on the student's token-type-embedding weights.

    Only `args.student_type == 'roberta'` is handled; for any other student
    type the function leaves the model untouched.

    Args:
        student: the student model, modified in place.
        args: parsed arguments; only `args.student_type` is read.
    """
    if args.student_type == 'roberta':
        student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Training")
|
parser = argparse.ArgumentParser(description="Training")
|
||||||
|
parser.add_argument("--force", action='store_true',
|
||||||
|
help="Overwrite dump_path if it already exists.")
|
||||||
|
|
||||||
parser.add_argument("--dump_path", type=str, required=True,
|
parser.add_argument("--dump_path", type=str, required=True,
|
||||||
help="The output directory (log, checkpoints, parameters, etc.)")
|
help="The output directory (log, checkpoints, parameters, etc.)")
|
||||||
parser.add_argument("--data_file", type=str, required=True,
|
parser.add_argument("--data_file", type=str, required=True,
|
||||||
help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
|
help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
|
||||||
parser.add_argument("--token_counts", type=str, required=True,
|
|
||||||
help="The token counts in the data_file for MLM.")
|
|
||||||
parser.add_argument("--force", action='store_true',
|
|
||||||
help="Overwrite dump_path if it already exists.")
|
|
||||||
|
|
||||||
parser.add_argument("--vocab_size", default=30522, type=int,
|
parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True,
|
||||||
help="The vocabulary size.")
|
help="The student type (DistilBERT, RoBERTa).")
|
||||||
parser.add_argument("--max_position_embeddings", default=512, type=int,
|
parser.add_argument("--student_config", type=str, required=True,
|
||||||
help="Maximum sequence length we can model (including [CLS] and [SEP]).")
|
help="Path to the student configuration.")
|
||||||
parser.add_argument("--sinusoidal_pos_embds", action='store_false',
|
parser.add_argument("--student_pretrained_weights", default=None, type=str,
|
||||||
help="If true, the position embeddings are simply fixed with sinusoidal embeddings.")
|
|
||||||
parser.add_argument("--n_layers", default=6, type=int,
|
|
||||||
help="Number of Transformer blocks.")
|
|
||||||
parser.add_argument("--n_heads", default=12, type=int,
|
|
||||||
help="Number of heads in the self-attention module.")
|
|
||||||
parser.add_argument("--dim", default=768, type=int,
|
|
||||||
help="Dimension through the network. Must be divisible by n_heads")
|
|
||||||
parser.add_argument("--hidden_dim", default=3072, type=int,
|
|
||||||
help="Intermediate dimension in the FFN.")
|
|
||||||
parser.add_argument("--dropout", default=0.1, type=float,
|
|
||||||
help="Dropout.")
|
|
||||||
parser.add_argument("--attention_dropout", default=0.1, type=float,
|
|
||||||
help="Dropout in self-attention.")
|
|
||||||
parser.add_argument("--activation", default='gelu', type=str,
|
|
||||||
help="Activation to use in self-attention")
|
|
||||||
parser.add_argument("--tie_weights_", action='store_false',
|
|
||||||
help="If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true.")
|
|
||||||
|
|
||||||
parser.add_argument("--from_pretrained_weights", default=None, type=str,
|
|
||||||
help="Load student initialization checkpoint.")
|
help="Load student initialization checkpoint.")
|
||||||
parser.add_argument("--from_pretrained_config", default=None, type=str,
|
|
||||||
help="Load student initialization architecture config.")
|
parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True,
|
||||||
parser.add_argument("--teacher_type", default="bert", choices=["bert", "roberta"],
|
|
||||||
help="Teacher type (BERT, RoBERTa).")
|
help="Teacher type (BERT, RoBERTa).")
|
||||||
parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
|
parser.add_argument("--teacher_name", type=str, required=True,
|
||||||
help="The teacher model.")
|
help="The teacher model.")
|
||||||
|
|
||||||
parser.add_argument("--temperature", default=2., type=float,
|
parser.add_argument("--temperature", default=2., type=float,
|
||||||
help="Temperature for the softmax temperature.")
|
help="Temperature for the softmax temperature.")
|
||||||
parser.add_argument("--alpha_ce", default=0.5, type=float,
|
parser.add_argument("--alpha_ce", default=0.5, type=float,
|
||||||
help="Linear weight for the distillation loss. Must be >=0.")
|
help="Linear weight for the distillation loss. Must be >=0.")
|
||||||
parser.add_argument("--alpha_mlm", default=0.5, type=float,
|
parser.add_argument("--alpha_mlm", default=0.0, type=float,
|
||||||
help="Linear weight for the MLM loss. Must be >=0.")
|
help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.")
|
||||||
|
parser.add_argument("--alpha_clm", default=0.5, type=float,
|
||||||
|
help="Linear weight for the CLM loss. Must be >=0.")
|
||||||
parser.add_argument("--alpha_mse", default=0.0, type=float,
|
parser.add_argument("--alpha_mse", default=0.0, type=float,
|
||||||
help="Linear weight of the MSE loss. Must be >=0.")
|
help="Linear weight of the MSE loss. Must be >=0.")
|
||||||
parser.add_argument("--alpha_cos", default=0.0, type=float,
|
parser.add_argument("--alpha_cos", default=0.0, type=float,
|
||||||
help="Linear weight of the cosine embedding loss. Must be >=0.")
|
help="Linear weight of the cosine embedding loss. Must be >=0.")
|
||||||
|
|
||||||
|
parser.add_argument("--mlm", action="store_true",
|
||||||
|
help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.")
|
||||||
parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
|
parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
|
||||||
help="Proportion of tokens for which we need to make a prediction.")
|
help="Proportion of tokens for which we need to make a prediction.")
|
||||||
parser.add_argument("--word_mask", default=0.8, type=float,
|
parser.add_argument("--word_mask", default=0.8, type=float,
|
||||||
@@ -95,17 +124,20 @@ def main():
|
|||||||
help="Proportion of tokens to randomly replace.")
|
help="Proportion of tokens to randomly replace.")
|
||||||
parser.add_argument("--mlm_smoothing", default=0.7, type=float,
|
parser.add_argument("--mlm_smoothing", default=0.7, type=float,
|
||||||
help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
|
help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
|
||||||
|
parser.add_argument("--token_counts", type=str,
|
||||||
|
help="The token counts in the data_file for MLM.")
|
||||||
|
|
||||||
parser.add_argument("--restrict_ce_to_mask", action='store_true',
|
parser.add_argument("--restrict_ce_to_mask", action='store_true',
|
||||||
help="If true, compute the distilation loss only the [MLM] prediction distribution.")
|
help="If true, compute the distilation loss only the [MLM] prediction distribution.")
|
||||||
|
parser.add_argument("--freeze_pos_embs", action="store_true",
|
||||||
|
help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.")
|
||||||
|
parser.add_argument("--freeze_token_type_embds", action="store_true",
|
||||||
|
help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.")
|
||||||
|
|
||||||
parser.add_argument("--n_epoch", type=int, default=3,
|
parser.add_argument("--n_epoch", type=int, default=3,
|
||||||
help="Number of pass on the whole dataset.")
|
help="Number of pass on the whole dataset.")
|
||||||
parser.add_argument("--batch_size", type=int, default=5,
|
parser.add_argument("--batch_size", type=int, default=5,
|
||||||
help="Batch size (for each process).")
|
help="Batch size (for each process).")
|
||||||
parser.add_argument("--tokens_per_batch", type=int, default=-1,
|
|
||||||
help="If specified, modify the batches so that they have approximately this number of tokens.")
|
|
||||||
parser.add_argument("--shuffle", action='store_false',
|
|
||||||
help="If true, shuffle the sequence order. Default is true.")
|
|
||||||
parser.add_argument("--group_by_size", action='store_false',
|
parser.add_argument("--group_by_size", action='store_false',
|
||||||
help="If true, group sequences that have similar length into the same batch. Default is true.")
|
help="If true, group sequences that have similar length into the same batch. Default is true.")
|
||||||
|
|
||||||
@@ -141,6 +173,7 @@ def main():
|
|||||||
parser.add_argument("--checkpoint_interval", type=int, default=4000,
|
parser.add_argument("--checkpoint_interval", type=int, default=4000,
|
||||||
help="Checkpoint interval.")
|
help="Checkpoint interval.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
sanity_checks(args)
|
||||||
|
|
||||||
|
|
||||||
## ARGS ##
|
## ARGS ##
|
||||||
@@ -164,21 +197,19 @@ def main():
|
|||||||
with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
|
with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
|
||||||
json.dump(vars(args), f, indent=4)
|
json.dump(vars(args), f, indent=4)
|
||||||
git_log(args.dump_path)
|
git_log(args.dump_path)
|
||||||
assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \
|
|
||||||
(args.from_pretrained_weights is not None and args.from_pretrained_config is not None)
|
|
||||||
|
|
||||||
|
student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type]
|
||||||
|
teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type]
|
||||||
|
|
||||||
### TOKENIZER ###
|
### TOKENIZER ###
|
||||||
if args.teacher_type == 'bert':
|
tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name)
|
||||||
tokenizer = BertTokenizer.from_pretrained(args.teacher_name)
|
|
||||||
elif args.teacher_type == 'roberta':
|
|
||||||
tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
|
|
||||||
special_tok_ids = {}
|
special_tok_ids = {}
|
||||||
for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
|
for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
|
||||||
idx = tokenizer.all_special_tokens.index(tok_symbol)
|
idx = tokenizer.all_special_tokens.index(tok_symbol)
|
||||||
special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
|
special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
|
||||||
logger.info(f'Special tokens {special_tok_ids}')
|
logger.info(f'Special tokens {special_tok_ids}')
|
||||||
args.special_tok_ids = special_tok_ids
|
args.special_tok_ids = special_tok_ids
|
||||||
|
args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]
|
||||||
|
|
||||||
|
|
||||||
## DATA LOADER ##
|
## DATA LOADER ##
|
||||||
@@ -187,35 +218,34 @@ def main():
|
|||||||
data = pickle.load(fp)
|
data = pickle.load(fp)
|
||||||
|
|
||||||
|
|
||||||
assert os.path.isfile(args.token_counts)
|
if args.mlm:
|
||||||
logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
|
logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
|
||||||
with open(args.token_counts, 'rb') as fp:
|
with open(args.token_counts, 'rb') as fp:
|
||||||
counts = pickle.load(fp)
|
counts = pickle.load(fp)
|
||||||
assert len(counts) == args.vocab_size
|
|
||||||
token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
|
token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
|
||||||
for idx in special_tok_ids.values():
|
for idx in special_tok_ids.values():
|
||||||
token_probs[idx] = 0. # do not predict special tokens
|
token_probs[idx] = 0. # do not predict special tokens
|
||||||
token_probs = torch.from_numpy(token_probs)
|
token_probs = torch.from_numpy(token_probs)
|
||||||
|
else:
|
||||||
|
token_probs = None
|
||||||
|
|
||||||
|
|
||||||
train_dataloader = Dataset(params=args, data=data)
|
train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
|
||||||
logger.info(f'Data loader created.')
|
logger.info(f'Data loader created.')
|
||||||
|
|
||||||
|
|
||||||
## STUDENT ##
|
## STUDENT ##
|
||||||
if args.from_pretrained_weights is not None:
|
logger.info(f'Loading student config from {args.student_config}')
|
||||||
assert os.path.isfile(args.from_pretrained_weights)
|
stu_architecture_config = student_config_class.from_pretrained(args.student_config)
|
||||||
assert os.path.isfile(args.from_pretrained_config)
|
stu_architecture_config.output_hidden_states = True
|
||||||
logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
|
|
||||||
logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
|
if args.student_pretrained_weights is not None:
|
||||||
stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
|
logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}')
|
||||||
stu_architecture_config.output_hidden_states = True
|
student = student_model_class.from_pretrained(args.student_pretrained_weights,
|
||||||
student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
|
config=stu_architecture_config)
|
||||||
config=stu_architecture_config)
|
|
||||||
else:
|
else:
|
||||||
args.vocab_size_or_config_json_file = args.vocab_size
|
student = student_model_class(stu_architecture_config)
|
||||||
stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
|
|
||||||
student = DistilBertForMaskedLM(stu_architecture_config)
|
|
||||||
|
|
||||||
|
|
||||||
if args.n_gpu > 0:
|
if args.n_gpu > 0:
|
||||||
@@ -224,18 +254,31 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
## TEACHER ##
|
## TEACHER ##
|
||||||
if args.teacher_type == 'bert':
|
teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
|
||||||
teacher = BertForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
|
|
||||||
elif args.teacher_type == 'roberta':
|
|
||||||
teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
|
|
||||||
if args.n_gpu > 0:
|
if args.n_gpu > 0:
|
||||||
teacher.to(f'cuda:{args.local_rank}')
|
teacher.to(f'cuda:{args.local_rank}')
|
||||||
logger.info(f'Teacher loaded from {args.teacher_name}.')
|
logger.info(f'Teacher loaded from {args.teacher_name}.')
|
||||||
|
|
||||||
|
|
||||||
|
## FREEZING ##
|
||||||
|
if args.freeze_pos_embs:
|
||||||
|
freeze_pos_embeddings(student, args)
|
||||||
|
if args.freeze_token_type_embds:
|
||||||
|
freeze_token_type_embeddings(student, args)
|
||||||
|
|
||||||
|
|
||||||
|
## SANITY CHECKS ##
|
||||||
|
assert student.config.vocab_size == teacher.config.vocab_size
|
||||||
|
assert student.config.hidden_size == teacher.config.hidden_size
|
||||||
|
assert student.config.max_position_embeddings == teacher.config.max_position_embeddings
|
||||||
|
if args.mlm:
|
||||||
|
assert token_probs.size(0) == stu_architecture_config.vocab_size
|
||||||
|
|
||||||
|
|
||||||
## DISTILLER ##
|
## DISTILLER ##
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
distiller = Distiller(params=args,
|
distiller = Distiller(params=args,
|
||||||
dataloader=train_dataloader,
|
dataset=train_lm_seq_dataset,
|
||||||
token_probs=token_probs,
|
token_probs=token_probs,
|
||||||
student=student,
|
student=student,
|
||||||
teacher=teacher)
|
teacher=teacher)
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"activation": "gelu",
|
||||||
|
"attention_dropout": 0.1,
|
||||||
|
"dim": 768,
|
||||||
|
"dropout": 0.1,
|
||||||
|
"hidden_dim": 3072,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"max_position_embeddings": 512,
|
||||||
|
"n_heads": 12,
|
||||||
|
"n_layers": 6,
|
||||||
|
"sinusoidal_pos_embds": true,
|
||||||
|
"tie_weights_": true,
|
||||||
|
"vocab_size": 30522
|
||||||
|
}
|
||||||
|
|
||||||
10
examples/distillation/training_configs/distilgpt2.json
Normal file
10
examples/distillation/training_configs/distilgpt2.json
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"layer_norm_epsilon": 0.00001,
|
||||||
|
"n_ctx": 1024,
|
||||||
|
"n_embd": 768,
|
||||||
|
"n_head": 12,
|
||||||
|
"n_layer": 6,
|
||||||
|
"n_positions": 1024,
|
||||||
|
"vocab_size": 50257
|
||||||
|
}
|
||||||
@@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
|
|||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
from torch.nn import CrossEntropyLoss, MSELoss
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME,
|
from transformers import (WEIGHTS_NAME,
|
||||||
BertConfig, BertForSequenceClassification, BertTokenizer,
|
BertConfig, BertForSequenceClassification, BertTokenizer,
|
||||||
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
|
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
|
||||||
XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
|
XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
|
||||||
|
|||||||
@@ -26,12 +26,13 @@ import torch
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
|
from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig
|
||||||
|
|
||||||
from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
||||||
from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
|
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
|
||||||
from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
|
from transformers import XLNetLMHeadModel, XLNetTokenizer
|
||||||
from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
|
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
|
||||||
|
from transformers import XLMWithLMHeadModel, XLMTokenizer
|
||||||
|
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
@@ -41,13 +42,14 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
|
MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
|
||||||
|
|
||||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
|
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig)), ())
|
||||||
|
|
||||||
MODEL_CLASSES = {
|
MODEL_CLASSES = {
|
||||||
'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
|
'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
|
||||||
'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||||
'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
|
'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
|
||||||
'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
|
'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
|
||||||
|
'xlm': (XLMWithLMHeadModel, XLMTokenizer),
|
||||||
}
|
}
|
||||||
|
|
||||||
# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
|
# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
|
||||||
@@ -103,7 +105,8 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
|
|||||||
return logits
|
return logits
|
||||||
|
|
||||||
|
|
||||||
def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
|
def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False,
|
||||||
|
xlm_lang=None, device='cpu'):
|
||||||
context = torch.tensor(context, dtype=torch.long, device=device)
|
context = torch.tensor(context, dtype=torch.long, device=device)
|
||||||
context = context.unsqueeze(0).repeat(num_samples, 1)
|
context = context.unsqueeze(0).repeat(num_samples, 1)
|
||||||
generated = context
|
generated = context
|
||||||
@@ -121,6 +124,9 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
|
|||||||
target_mapping[0, 0, -1] = 1.0 # predict last token
|
target_mapping[0, 0, -1] = 1.0 # predict last token
|
||||||
inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
|
inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
|
||||||
|
|
||||||
|
if xlm_lang is not None:
|
||||||
|
inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1]).view(1, -1)
|
||||||
|
|
||||||
outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
|
outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
|
||||||
next_token_logits = outputs[0][0, -1, :] / temperature
|
next_token_logits = outputs[0][0, -1, :] / temperature
|
||||||
filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
|
filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
|
||||||
@@ -137,6 +143,7 @@ def main():
|
|||||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||||
parser.add_argument("--prompt", type=str, default="")
|
parser.add_argument("--prompt", type=str, default="")
|
||||||
parser.add_argument("--padding_text", type=str, default="")
|
parser.add_argument("--padding_text", type=str, default="")
|
||||||
|
parser.add_argument("--xlm_lang", type=str, default="", help="Optional language when used with the XLM model.")
|
||||||
parser.add_argument("--length", type=int, default=20)
|
parser.add_argument("--length", type=int, default=20)
|
||||||
parser.add_argument("--temperature", type=float, default=1.0)
|
parser.add_argument("--temperature", type=float, default=1.0)
|
||||||
parser.add_argument("--top_k", type=int, default=0)
|
parser.add_argument("--top_k", type=int, default=0)
|
||||||
@@ -170,6 +177,18 @@ def main():
|
|||||||
|
|
||||||
print(args)
|
print(args)
|
||||||
while True:
|
while True:
|
||||||
|
xlm_lang = None
|
||||||
|
# XLM Language usage detailed in the issues #1414
|
||||||
|
if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id') and hasattr(model.config, 'use_lang_emb') \
|
||||||
|
and model.config.use_lang_emb:
|
||||||
|
if args.xlm_lang:
|
||||||
|
language = args.xlm_lang
|
||||||
|
else:
|
||||||
|
language = None
|
||||||
|
while language not in tokenizer.lang2id.keys():
|
||||||
|
language = input("Using XLM. Select language in " + str(list(tokenizer.lang2id.keys())) + " >>> ")
|
||||||
|
xlm_lang = tokenizer.lang2id[language]
|
||||||
|
|
||||||
raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
|
raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
|
||||||
if args.model_type in ["transfo-xl", "xlnet"]:
|
if args.model_type in ["transfo-xl", "xlnet"]:
|
||||||
# Models with memory likes to have a long prompt for short inputs.
|
# Models with memory likes to have a long prompt for short inputs.
|
||||||
@@ -182,12 +201,15 @@ def main():
|
|||||||
temperature=args.temperature,
|
temperature=args.temperature,
|
||||||
top_k=args.top_k,
|
top_k=args.top_k,
|
||||||
top_p=args.top_p,
|
top_p=args.top_p,
|
||||||
device=args.device,
|
|
||||||
is_xlnet=bool(args.model_type == "xlnet"),
|
is_xlnet=bool(args.model_type == "xlnet"),
|
||||||
|
xlm_lang=xlm_lang,
|
||||||
|
device=args.device,
|
||||||
)
|
)
|
||||||
out = out[0, len(context_tokens):].tolist()
|
out = out[0, len(context_tokens):].tolist()
|
||||||
text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
|
|
||||||
|
text = tokenizer.decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
|
||||||
text = text[: text.find(args.stop_token) if args.stop_token else None]
|
text = text[: text.find(args.stop_token) if args.stop_token else None]
|
||||||
|
|
||||||
print(text)
|
print(text)
|
||||||
if args.prompt:
|
if args.prompt:
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from torch.utils.data.distributed import DistributedSampler
|
|||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForSequenceClassification, BertTokenizer,
|
BertForSequenceClassification, BertTokenizer,
|
||||||
RobertaConfig,
|
RobertaConfig,
|
||||||
RobertaForSequenceClassification,
|
RobertaForSequenceClassification,
|
||||||
@@ -39,22 +39,29 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLMConfig, XLMForSequenceClassification,
|
XLMConfig, XLMForSequenceClassification,
|
||||||
XLMTokenizer, XLNetConfig,
|
XLMTokenizer, XLNetConfig,
|
||||||
XLNetForSequenceClassification,
|
XLNetForSequenceClassification,
|
||||||
XLNetTokenizer)
|
XLNetTokenizer,
|
||||||
|
DistilBertConfig,
|
||||||
|
DistilBertForSequenceClassification,
|
||||||
|
DistilBertTokenizer)
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, WarmupLinearSchedule
|
||||||
|
|
||||||
from utils_glue import (compute_metrics, convert_examples_to_features,
|
from transformers import glue_compute_metrics as compute_metrics
|
||||||
output_modes, processors)
|
from transformers import glue_output_modes as output_modes
|
||||||
|
from transformers import glue_processors as processors
|
||||||
|
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig)), ())
|
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig,
|
||||||
|
RobertaConfig, DistilBertConfig)), ())
|
||||||
|
|
||||||
MODEL_CLASSES = {
|
MODEL_CLASSES = {
|
||||||
'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
|
'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
|
||||||
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
||||||
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
||||||
|
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -128,10 +135,11 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
batch = tuple(t.to(args.device) for t in batch)
|
batch = tuple(t.to(args.device) for t in batch)
|
||||||
inputs = {'input_ids': batch[0],
|
inputs = {'input_ids': batch[0],
|
||||||
'attention_mask': batch[1],
|
'attention_mask': batch[1],
|
||||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids
|
|
||||||
'labels': batch[3]}
|
'labels': batch[3]}
|
||||||
|
if args.model_type != 'distilbert':
|
||||||
|
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||||
@@ -148,8 +156,8 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
|
|
||||||
tr_loss += loss.item()
|
tr_loss += loss.item()
|
||||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||||
scheduler.step() # Update learning rate schedule
|
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
scheduler.step() # Update learning rate schedule
|
||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
@@ -218,8 +226,9 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
inputs = {'input_ids': batch[0],
|
inputs = {'input_ids': batch[0],
|
||||||
'attention_mask': batch[1],
|
'attention_mask': batch[1],
|
||||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids
|
|
||||||
'labels': batch[3]}
|
'labels': batch[3]}
|
||||||
|
if args.model_type != 'distilbert':
|
||||||
|
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
tmp_eval_loss, logits = outputs[:2]
|
tmp_eval_loss, logits = outputs[:2]
|
||||||
|
|
||||||
@@ -272,15 +281,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
# HACK(label indices are swapped in RoBERTa pretrained model)
|
# HACK(label indices are swapped in RoBERTa pretrained model)
|
||||||
label_list[1], label_list[2] = label_list[2], label_list[1]
|
label_list[1], label_list[2] = label_list[2], label_list[1]
|
||||||
examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||||
features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
|
features = convert_examples_to_features(examples,
|
||||||
cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end
|
tokenizer,
|
||||||
cls_token=tokenizer.cls_token,
|
label_list=label_list,
|
||||||
cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
|
max_length=args.max_seq_length,
|
||||||
sep_token=tokenizer.sep_token,
|
output_mode=output_mode,
|
||||||
sep_token_extra=bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
||||||
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
|
||||||
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
|
|
||||||
)
|
)
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
@@ -291,14 +299,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
|
|
||||||
# Convert to Tensors and build dataset
|
# Convert to Tensors and build dataset
|
||||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||||
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
|
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
|
||||||
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
|
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
|
||||||
if output_mode == "classification":
|
if output_mode == "classification":
|
||||||
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
|
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||||
elif output_mode == "regression":
|
elif output_mode == "regression":
|
||||||
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
|
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
||||||
|
|
||||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
|
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
@@ -478,7 +486,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
|
|||||||
@@ -35,11 +35,12 @@ from torch.utils.data.distributed import DistributedSampler
|
|||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
|
from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
|
||||||
BertConfig, BertForMaskedLM, BertTokenizer,
|
BertConfig, BertForMaskedLM, BertTokenizer,
|
||||||
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
||||||
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
||||||
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
|
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
|
||||||
|
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -49,7 +50,8 @@ MODEL_CLASSES = {
|
|||||||
'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
|
'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
|
||||||
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||||
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
||||||
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
|
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
||||||
|
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -57,7 +59,7 @@ class TextDataset(Dataset):
|
|||||||
def __init__(self, tokenizer, file_path='train', block_size=512):
|
def __init__(self, tokenizer, file_path='train', block_size=512):
|
||||||
assert os.path.isfile(file_path)
|
assert os.path.isfile(file_path)
|
||||||
directory, filename = os.path.split(file_path)
|
directory, filename = os.path.split(file_path)
|
||||||
cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')
|
cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
|
||||||
|
|
||||||
if os.path.exists(cached_features_file):
|
if os.path.exists(cached_features_file):
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
@@ -72,9 +74,8 @@ class TextDataset(Dataset):
|
|||||||
|
|
||||||
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
||||||
|
|
||||||
while len(tokenized_text) >= block_size: # Truncate in block of block_size
|
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
||||||
self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
|
self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
|
||||||
tokenized_text = tokenized_text[block_size:]
|
|
||||||
# Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
|
# Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
|
||||||
# If your dataset is small, first you should loook for a bigger one :-) and second you
|
# If your dataset is small, first you should loook for a bigger one :-) and second you
|
||||||
# can change this behavior by adding (model specific) padding.
|
# can change this behavior by adding (model specific) padding.
|
||||||
@@ -186,7 +187,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
labels = labels.to(args.device)
|
labels = labels.to(args.device)
|
||||||
model.train()
|
model.train()
|
||||||
outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
|
outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||||
@@ -380,7 +381,7 @@ def main():
|
|||||||
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.model_type in ["bert", "roberta"] and not args.mlm:
|
if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
|
||||||
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
||||||
"flag (masked language modeling).")
|
"flag (masked language modeling).")
|
||||||
if args.eval_data_file is None and args.do_eval:
|
if args.eval_data_file is None and args.do_eval:
|
||||||
@@ -479,7 +480,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
|
|||||||
@@ -32,13 +32,13 @@ from torch.utils.data.distributed import DistributedSampler
|
|||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForMultipleChoice, BertTokenizer,
|
BertForMultipleChoice, BertTokenizer,
|
||||||
XLNetConfig, XLNetForMultipleChoice,
|
XLNetConfig, XLNetForMultipleChoice,
|
||||||
XLNetTokenizer, RobertaConfig,
|
XLNetTokenizer, RobertaConfig,
|
||||||
RobertaForMultipleChoice, RobertaTokenizer)
|
RobertaForMultipleChoice, RobertaTokenizer)
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, WarmupLinearSchedule
|
||||||
|
|
||||||
from utils_multiple_choice import (convert_examples_to_features, processors)
|
from utils_multiple_choice import (convert_examples_to_features, processors)
|
||||||
|
|
||||||
@@ -141,7 +141,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
|
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
|
||||||
'labels': batch[3]}
|
'labels': batch[3]}
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||||
@@ -508,7 +508,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
@@ -524,7 +524,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
# if args.eval_all_checkpoints: # can not use this to do test!!
|
# if args.eval_all_checkpoints: # can not use this to do test!!
|
||||||
# checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
# checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
# logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
# logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
|
|||||||
@@ -32,14 +32,15 @@ from tqdm import tqdm, trange
|
|||||||
|
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForQuestionAnswering, BertTokenizer,
|
BertForQuestionAnswering, BertTokenizer,
|
||||||
XLMConfig, XLMForQuestionAnswering,
|
XLMConfig, XLMForQuestionAnswering,
|
||||||
XLMTokenizer, XLNetConfig,
|
XLMTokenizer, XLNetConfig,
|
||||||
XLNetForQuestionAnswering,
|
XLNetForQuestionAnswering,
|
||||||
XLNetTokenizer)
|
XLNetTokenizer,
|
||||||
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, WarmupLinearSchedule
|
||||||
|
|
||||||
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
||||||
RawResult, write_predictions,
|
RawResult, write_predictions,
|
||||||
@@ -59,6 +60,7 @@ MODEL_CLASSES = {
|
|||||||
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
||||||
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
||||||
|
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
def set_seed(args):
|
def set_seed(args):
|
||||||
@@ -140,7 +142,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
inputs.update({'cls_index': batch[5],
|
inputs.update({'cls_index': batch[5],
|
||||||
'p_mask': batch[6]})
|
'p_mask': batch[6]})
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||||
@@ -508,7 +510,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||||
|
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
|
|
||||||
|
|||||||
40
examples/run_tf_glue.py
Normal file
40
examples/run_tf_glue.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow_datasets
|
||||||
|
from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
|
||||||
|
|
||||||
|
# Load dataset, tokenizer, model from pretrained model/vocabulary
|
||||||
|
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
||||||
|
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
|
||||||
|
data = tensorflow_datasets.load('glue/mrpc')
|
||||||
|
|
||||||
|
# Prepare dataset for GLUE as a tf.data.Dataset instance
|
||||||
|
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
|
||||||
|
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
|
||||||
|
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
|
||||||
|
valid_dataset = valid_dataset.batch(64)
|
||||||
|
|
||||||
|
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
|
||||||
|
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
|
||||||
|
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||||
|
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
|
||||||
|
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
|
||||||
|
|
||||||
|
# Train and evaluate using tf.keras.Model.fit()
|
||||||
|
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
|
||||||
|
validation_data=valid_dataset, validation_steps=7)
|
||||||
|
|
||||||
|
# Load the TensorFlow model in PyTorch for inspection
|
||||||
|
model.save_pretrained('./save/')
|
||||||
|
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
|
||||||
|
|
||||||
|
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
|
||||||
|
sentence_0 = "This research was consistent with his findings."
|
||||||
|
sentence_1 = "His findings were compatible with this research."
|
||||||
|
sentence_2 = "His findings were not compatible with this research."
|
||||||
|
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
|
||||||
|
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
|
||||||
|
|
||||||
|
pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
|
||||||
|
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
|
||||||
|
print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
|
||||||
|
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
|
||||||
@@ -24,7 +24,7 @@ import math
|
|||||||
import collections
|
import collections
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||||
|
|
||||||
# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
|
# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
|
||||||
from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
|
from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
|
||||||
|
|||||||
50
hubconf.py
50
hubconf.py
@@ -1,7 +1,7 @@
|
|||||||
from pytorch_transformers import (
|
from transformers import (
|
||||||
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
|
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
|
||||||
)
|
)
|
||||||
from pytorch_transformers.file_utils import add_start_docstrings
|
from transformers.file_utils import add_start_docstrings
|
||||||
|
|
||||||
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
|
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
|
||||||
|
|
||||||
@@ -11,12 +11,12 @@ def config(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.
|
config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.
|
||||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
|
config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json')
|
||||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
|
config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
|
||||||
assert config.output_attention == True
|
assert config.output_attention == True
|
||||||
config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
|
config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
|
||||||
assert config.output_attention == True
|
assert config.output_attention == True
|
||||||
assert unused_kwargs == {'foo': False}
|
assert unused_kwargs == {'foo': False}
|
||||||
|
|
||||||
@@ -31,8 +31,8 @@ def tokenizer(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache.
|
tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache.
|
||||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -45,13 +45,13 @@ def model(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
assert model.config.output_attention == True
|
assert model.config.output_attention == True
|
||||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -63,13 +63,13 @@ def modelWithLMHead(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/bert_model/')`
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
assert model.config.output_attention == True
|
assert model.config.output_attention == True
|
||||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
|
return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
|
||||||
@@ -81,13 +81,13 @@ def modelForSequenceClassification(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/bert_model/')`
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
assert model.config.output_attention == True
|
assert model.config.output_attention == True
|
||||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -100,13 +100,13 @@ def modelForQuestionAnswering(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/bert_model/')`
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
assert model.config.output_attention == True
|
assert model.config.output_attention == True
|
||||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
|
return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
|
||||||
|
|||||||
@@ -1,75 +0,0 @@
|
|||||||
__version__ = "1.2.0"
|
|
||||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
|
||||||
# default Python logging output behavior when present.
|
|
||||||
# see: https://github.com/abseil/abseil-py/issues/99
|
|
||||||
# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
|
|
||||||
try:
|
|
||||||
import absl.logging
|
|
||||||
absl.logging.set_verbosity('info')
|
|
||||||
absl.logging.set_stderrthreshold('info')
|
|
||||||
absl.logging._warn_preinit_stderr = False
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Tokenizer
|
|
||||||
from .tokenization_utils import (PreTrainedTokenizer)
|
|
||||||
from .tokenization_auto import AutoTokenizer
|
|
||||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
|
||||||
from .tokenization_openai import OpenAIGPTTokenizer
|
|
||||||
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
|
||||||
from .tokenization_gpt2 import GPT2Tokenizer
|
|
||||||
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
|
||||||
from .tokenization_xlm import XLMTokenizer
|
|
||||||
from .tokenization_roberta import RobertaTokenizer
|
|
||||||
from .tokenization_distilbert import DistilBertTokenizer
|
|
||||||
|
|
||||||
# Configurations
|
|
||||||
from .configuration_utils import PretrainedConfig
|
|
||||||
from .configuration_auto import AutoConfig
|
|
||||||
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
|
|
||||||
# Modeling
|
|
||||||
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
|
|
||||||
from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
|
|
||||||
AutoModelWithLMHead)
|
|
||||||
|
|
||||||
from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
|
|
||||||
BertForMaskedLM, BertForNextSentencePrediction,
|
|
||||||
BertForSequenceClassification, BertForMultipleChoice,
|
|
||||||
BertForTokenClassification, BertForQuestionAnswering,
|
|
||||||
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
|
|
||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
|
||||||
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
|
||||||
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
|
|
||||||
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
|
||||||
load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
|
||||||
XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNetForMultipleChoice,
|
|
||||||
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
|
||||||
XLMWithLMHeadModel, XLMForSequenceClassification,
|
|
||||||
XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
|
|
||||||
RobertaForMultipleChoice, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
|
||||||
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
|
||||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
|
|
||||||
# Optimization
|
|
||||||
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
|
||||||
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
|
||||||
|
|
||||||
# Files and general utilities
|
|
||||||
from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
|
|
||||||
cached_path, add_start_docstrings, add_end_docstrings,
|
|
||||||
WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME)
|
|
||||||
@@ -1,5 +1,3 @@
|
|||||||
# PyTorch
|
|
||||||
torch>=1.0.0
|
|
||||||
# progress bars in model download and training scripts
|
# progress bars in model download and training scripts
|
||||||
tqdm
|
tqdm
|
||||||
# Accessing files from S3 directly.
|
# Accessing files from S3 directly.
|
||||||
|
|||||||
23
setup.py
23
setup.py
@@ -13,11 +13,11 @@ To create the package for pypi.
|
|||||||
4. Build both the sources and the wheel. Do not change anything in setup.py between
|
4. Build both the sources and the wheel. Do not change anything in setup.py between
|
||||||
creating the wheel and the source distribution (obviously).
|
creating the wheel and the source distribution (obviously).
|
||||||
|
|
||||||
For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory.
|
For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
|
||||||
(this will build a wheel for the python version you use to build it - make sure you use python 3.x).
|
(this will build a wheel for the python version you use to build it - make sure you use python 3.x).
|
||||||
|
|
||||||
For the sources, run: "python setup.py sdist"
|
For the sources, run: "python setup.py sdist"
|
||||||
You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp.
|
You should now have a /dist directory with both .whl and .tar.gz source versions.
|
||||||
|
|
||||||
5. Check that everything looks correct by uploading the package to the pypi test server:
|
5. Check that everything looks correct by uploading the package to the pypi test server:
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ To create the package for pypi.
|
|||||||
(PyPI suggests using twine, as other methods upload files via plaintext.)
|
(PyPI suggests using twine, as other methods upload files via plaintext.)
|
||||||
|
|
||||||
Check that you can install it in a virtualenv by running:
|
Check that you can install it in a virtualenv by running:
|
||||||
pip install -i https://testpypi.python.org/pypi pytorch-transformers
|
pip install -i https://testpypi.python.org/pypi transformers
|
||||||
|
|
||||||
6. Upload the final version to actual pypi:
|
6. Upload the final version to actual pypi:
|
||||||
twine upload dist/* -r pypi
|
twine upload dist/* -r pypi
|
||||||
@@ -37,20 +37,19 @@ from io import open
|
|||||||
from setuptools import find_packages, setup
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="pytorch_transformers",
|
name="transformers",
|
||||||
version="1.2.0",
|
version="2.0.0",
|
||||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
|
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
||||||
author_email="thomas@huggingface.co",
|
author_email="thomas@huggingface.co",
|
||||||
description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
|
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
||||||
long_description=open("README.md", "r", encoding='utf-8').read(),
|
long_description=open("README.md", "r", encoding='utf-8').read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
|
keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
|
||||||
license='Apache',
|
license='Apache',
|
||||||
url="https://github.com/huggingface/pytorch-transformers",
|
url="https://github.com/huggingface/transformers",
|
||||||
packages=find_packages(exclude=["*.tests", "*.tests.*",
|
packages=find_packages(exclude=["*.tests", "*.tests.*",
|
||||||
"tests.*", "tests"]),
|
"tests.*", "tests"]),
|
||||||
install_requires=['torch>=1.0.0',
|
install_requires=['numpy',
|
||||||
'numpy',
|
|
||||||
'boto3',
|
'boto3',
|
||||||
'requests',
|
'requests',
|
||||||
'tqdm',
|
'tqdm',
|
||||||
@@ -59,7 +58,7 @@ setup(
|
|||||||
'sacremoses'],
|
'sacremoses'],
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
"pytorch_transformers=pytorch_transformers.__main__:main",
|
"transformers=transformers.__main__:main",
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
# python_requires='>=3.5.0',
|
# python_requires='>=3.5.0',
|
||||||
|
|||||||
165
transformers/__init__.py
Normal file
165
transformers/__init__.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
__version__ = "2.0.0"
|
||||||
|
|
||||||
|
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||||
|
# default Python logging output behavior when present.
|
||||||
|
# see: https://github.com/abseil/abseil-py/issues/99
|
||||||
|
# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
|
||||||
|
try:
|
||||||
|
import absl.logging
|
||||||
|
absl.logging.set_verbosity('info')
|
||||||
|
absl.logging.set_stderrthreshold('info')
|
||||||
|
absl.logging._warn_preinit_stderr = False
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
# Files and general utilities
|
||||||
|
from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
|
||||||
|
cached_path, add_start_docstrings, add_end_docstrings,
|
||||||
|
WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
|
||||||
|
is_tf_available, is_torch_available)
|
||||||
|
|
||||||
|
from .data import (is_sklearn_available,
|
||||||
|
InputExample, InputFeatures, DataProcessor,
|
||||||
|
glue_output_modes, glue_convert_examples_to_features,
|
||||||
|
glue_processors, glue_tasks_num_labels)
|
||||||
|
|
||||||
|
if is_sklearn_available():
|
||||||
|
from .data import glue_compute_metrics
|
||||||
|
|
||||||
|
# Tokenizers
|
||||||
|
from .tokenization_utils import (PreTrainedTokenizer)
|
||||||
|
from .tokenization_auto import AutoTokenizer
|
||||||
|
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||||
|
from .tokenization_openai import OpenAIGPTTokenizer
|
||||||
|
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
||||||
|
from .tokenization_gpt2 import GPT2Tokenizer
|
||||||
|
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
||||||
|
from .tokenization_xlm import XLMTokenizer
|
||||||
|
from .tokenization_roberta import RobertaTokenizer
|
||||||
|
from .tokenization_distilbert import DistilBertTokenizer
|
||||||
|
|
||||||
|
# Configurations
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
from .configuration_auto import AutoConfig
|
||||||
|
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
|
# Modeling
|
||||||
|
if is_torch_available():
|
||||||
|
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
|
||||||
|
from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
|
||||||
|
AutoModelWithLMHead)
|
||||||
|
|
||||||
|
from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
|
||||||
|
BertForMaskedLM, BertForNextSentencePrediction,
|
||||||
|
BertForSequenceClassification, BertForMultipleChoice,
|
||||||
|
BertForTokenClassification, BertForQuestionAnswering,
|
||||||
|
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
|
||||||
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
||||||
|
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
||||||
|
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
|
||||||
|
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
||||||
|
load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
||||||
|
XLNetForSequenceClassification, XLNetForMultipleChoice,
|
||||||
|
XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
|
||||||
|
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
||||||
|
XLMWithLMHeadModel, XLMForSequenceClassification,
|
||||||
|
XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
|
||||||
|
XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
|
||||||
|
RobertaForSequenceClassification, RobertaForMultipleChoice,
|
||||||
|
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
||||||
|
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
||||||
|
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
# Optimization
|
||||||
|
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
||||||
|
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
||||||
|
|
||||||
|
|
||||||
|
# TensorFlow
|
||||||
|
if is_tf_available():
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
|
||||||
|
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
||||||
|
TFAutoModelWithLMHead)
|
||||||
|
|
||||||
|
from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
|
||||||
|
TFBertModel, TFBertForPreTraining,
|
||||||
|
TFBertForMaskedLM, TFBertForNextSentencePrediction,
|
||||||
|
TFBertForSequenceClassification, TFBertForMultipleChoice,
|
||||||
|
TFBertForTokenClassification, TFBertForQuestionAnswering,
|
||||||
|
load_bert_pt_weights_in_tf2,
|
||||||
|
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer,
|
||||||
|
TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel,
|
||||||
|
load_gpt2_pt_weights_in_tf2,
|
||||||
|
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer,
|
||||||
|
TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel,
|
||||||
|
load_openai_gpt_pt_weights_in_tf2,
|
||||||
|
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer,
|
||||||
|
TFTransfoXLModel, TFTransfoXLLMHeadModel,
|
||||||
|
load_transfo_xl_pt_weights_in_tf2,
|
||||||
|
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
||||||
|
TFXLNetModel, TFXLNetLMHeadModel,
|
||||||
|
TFXLNetForSequenceClassification,
|
||||||
|
TFXLNetForQuestionAnsweringSimple,
|
||||||
|
load_xlnet_pt_weights_in_tf2,
|
||||||
|
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer,
|
||||||
|
TFXLMModel, TFXLMWithLMHeadModel,
|
||||||
|
TFXLMForSequenceClassification,
|
||||||
|
TFXLMForQuestionAnsweringSimple,
|
||||||
|
load_xlm_pt_weights_in_tf2,
|
||||||
|
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer,
|
||||||
|
TFRobertaModel, TFRobertaForMaskedLM,
|
||||||
|
TFRobertaForSequenceClassification,
|
||||||
|
load_roberta_pt_weights_in_tf2,
|
||||||
|
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
|
||||||
|
TFDistilBertModel, TFDistilBertForMaskedLM,
|
||||||
|
TFDistilBertForSequenceClassification,
|
||||||
|
TFDistilBertForQuestionAnswering,
|
||||||
|
load_distilbert_pt_weights_in_tf2,
|
||||||
|
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
# TF 2.0 <=> PyTorch conversion utilities
|
||||||
|
if is_tf_available() and is_torch_available():
|
||||||
|
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
|
||||||
|
load_pytorch_checkpoint_in_tf2_model,
|
||||||
|
load_pytorch_weights_in_tf2_model,
|
||||||
|
load_pytorch_model_in_tf2_model,
|
||||||
|
load_tf2_checkpoint_in_pytorch_model,
|
||||||
|
load_tf2_weights_in_pytorch_model,
|
||||||
|
load_tf2_model_in_pytorch_model)
|
||||||
|
|
||||||
|
if not is_tf_available() and not is_torch_available():
|
||||||
|
logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found."
|
||||||
|
"Models won't be available and only tokenizers, configuration"
|
||||||
|
"and file/data utilities can be used.")
|
||||||
@@ -3,36 +3,37 @@ def main():
|
|||||||
import sys
|
import sys
|
||||||
if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
|
if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
|
||||||
print(
|
print(
|
||||||
"Should be used as one of: \n"
|
"This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
|
||||||
">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
|
"It should be used as one of: \n"
|
||||||
">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
|
">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
|
||||||
">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
|
">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
|
||||||
">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
|
">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
|
||||||
">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
|
">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
|
||||||
">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
|
">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
|
||||||
|
">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
|
||||||
else:
|
else:
|
||||||
if sys.argv[1] == "bert":
|
if sys.argv[1] == "bert":
|
||||||
try:
|
try:
|
||||||
from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
|
from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||||
"In that case, it requires TensorFlow to be installed. Please see "
|
"In that case, it requires TensorFlow to be installed. Please see "
|
||||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if len(sys.argv) != 5:
|
if len(sys.argv) != 5:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
|
print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
|
||||||
else:
|
else:
|
||||||
PYTORCH_DUMP_OUTPUT = sys.argv.pop()
|
PYTORCH_DUMP_OUTPUT = sys.argv.pop()
|
||||||
TF_CONFIG = sys.argv.pop()
|
TF_CONFIG = sys.argv.pop()
|
||||||
TF_CHECKPOINT = sys.argv.pop()
|
TF_CHECKPOINT = sys.argv.pop()
|
||||||
convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
||||||
elif sys.argv[1] == "gpt":
|
elif sys.argv[1] == "gpt":
|
||||||
from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
|
from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
|
||||||
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
|
print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
|
||||||
else:
|
else:
|
||||||
OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
|
OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
|
||||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||||
@@ -45,15 +46,15 @@ def main():
|
|||||||
PYTORCH_DUMP_OUTPUT)
|
PYTORCH_DUMP_OUTPUT)
|
||||||
elif sys.argv[1] == "transfo_xl":
|
elif sys.argv[1] == "transfo_xl":
|
||||||
try:
|
try:
|
||||||
from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
|
from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||||
"In that case, it requires TensorFlow to be installed. Please see "
|
"In that case, it requires TensorFlow to be installed. Please see "
|
||||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||||
raise
|
raise
|
||||||
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
|
print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
|
||||||
else:
|
else:
|
||||||
if 'ckpt' in sys.argv[2].lower():
|
if 'ckpt' in sys.argv[2].lower():
|
||||||
TF_CHECKPOINT = sys.argv[2]
|
TF_CHECKPOINT = sys.argv[2]
|
||||||
@@ -69,16 +70,16 @@ def main():
|
|||||||
convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
|
convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
|
||||||
elif sys.argv[1] == "gpt2":
|
elif sys.argv[1] == "gpt2":
|
||||||
try:
|
try:
|
||||||
from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
|
from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||||
"In that case, it requires TensorFlow to be installed. Please see "
|
"In that case, it requires TensorFlow to be installed. Please see "
|
||||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
|
print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
|
||||||
else:
|
else:
|
||||||
TF_CHECKPOINT = sys.argv[2]
|
TF_CHECKPOINT = sys.argv[2]
|
||||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||||
@@ -89,16 +90,16 @@ def main():
|
|||||||
convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
||||||
elif sys.argv[1] == "xlnet":
|
elif sys.argv[1] == "xlnet":
|
||||||
try:
|
try:
|
||||||
from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
|
from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||||
"In that case, it requires TensorFlow to be installed. Please see "
|
"In that case, it requires TensorFlow to be installed. Please see "
|
||||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if len(sys.argv) < 5 or len(sys.argv) > 6:
|
if len(sys.argv) < 5 or len(sys.argv) > 6:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
|
print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
|
||||||
else:
|
else:
|
||||||
TF_CHECKPOINT = sys.argv[2]
|
TF_CHECKPOINT = sys.argv[2]
|
||||||
TF_CONFIG = sys.argv[3]
|
TF_CONFIG = sys.argv[3]
|
||||||
@@ -113,11 +114,11 @@ def main():
|
|||||||
PYTORCH_DUMP_OUTPUT,
|
PYTORCH_DUMP_OUTPUT,
|
||||||
FINETUNING_TASK)
|
FINETUNING_TASK)
|
||||||
elif sys.argv[1] == "xlm":
|
elif sys.argv[1] == "xlm":
|
||||||
from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
|
from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
|
||||||
|
|
||||||
if len(sys.argv) != 4:
|
if len(sys.argv) != 4:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
|
print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
|
||||||
else:
|
else:
|
||||||
XLM_CHECKPOINT_PATH = sys.argv[2]
|
XLM_CHECKPOINT_PATH = sys.argv[2]
|
||||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||||
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class AutoConfig(object):
|
class AutoConfig(object):
|
||||||
r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
|
r""":class:`~transformers.AutoConfig` is a generic configuration class
|
||||||
that will be instantiated as one of the configuration classes of the library
|
that will be instantiated as one of the configuration classes of the library
|
||||||
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -76,7 +76,7 @@ class AutoConfig(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
@@ -45,7 +45,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
class BertConfig(PretrainedConfig):
|
class BertConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
|
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
|
||||||
`BertModel`.
|
`BertModel`.
|
||||||
|
|
||||||
|
|
||||||
@@ -58,7 +58,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
||||||
layer in the Transformer encoder.
|
layer in the Transformer encoder.
|
||||||
hidden_act: The non-linear activation function (function or string) in the
|
hidden_act: The non-linear activation function (function or string) in the
|
||||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||||
hidden_dropout_prob: The dropout probabilitiy for all fully connected
|
hidden_dropout_prob: The dropout probabilitiy for all fully connected
|
||||||
layers in the embeddings, encoder, and pooler.
|
layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob: The dropout ratio for the attention
|
attention_probs_dropout_prob: The dropout ratio for the attention
|
||||||
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30522,
|
vocab_size_or_config_json_file=30522,
|
||||||
max_position_embeddings=512,
|
max_position_embeddings=512,
|
||||||
sinusoidal_pos_embds=True,
|
sinusoidal_pos_embds=False,
|
||||||
n_layers=6,
|
n_layers=6,
|
||||||
n_heads=12,
|
n_heads=12,
|
||||||
dim=768,
|
dim=768,
|
||||||
@@ -28,7 +28,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
|
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
|
||||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
|
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
|
||||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
|
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
|
||||||
|
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
|
||||||
|
|
||||||
class GPT2Config(PretrainedConfig):
|
class GPT2Config(PretrainedConfig):
|
||||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
"""Configuration class to store the configuration of a `GPT2Model`.
|
||||||
@@ -36,7 +36,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
||||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
|
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -95,10 +95,43 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
init_range=0.01,
|
init_range=0.01,
|
||||||
proj_init_std=0.01,
|
proj_init_std=0.01,
|
||||||
init_std=0.02,
|
init_std=0.02,
|
||||||
|
layer_norm_epsilon=1e-5,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""Constructs TransfoXLConfig.
|
"""Constructs TransfoXLConfig.
|
||||||
"""
|
"""
|
||||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
super(TransfoXLConfig, self).__init__(**kwargs)
|
||||||
|
self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
|
||||||
|
self.cutoffs = []
|
||||||
|
self.cutoffs.extend(cutoffs)
|
||||||
|
self.tie_weight = tie_weight
|
||||||
|
if proj_share_all_but_first:
|
||||||
|
self.tie_projs = [False] + [True] * len(self.cutoffs)
|
||||||
|
else:
|
||||||
|
self.tie_projs = [False] + [False] * len(self.cutoffs)
|
||||||
|
self.d_model = d_model
|
||||||
|
self.d_embed = d_embed
|
||||||
|
self.d_head = d_head
|
||||||
|
self.d_inner = d_inner
|
||||||
|
self.div_val = div_val
|
||||||
|
self.pre_lnorm = pre_lnorm
|
||||||
|
self.n_layer = n_layer
|
||||||
|
self.n_head = n_head
|
||||||
|
self.tgt_len = tgt_len
|
||||||
|
self.ext_len = ext_len
|
||||||
|
self.mem_len = mem_len
|
||||||
|
self.same_length = same_length
|
||||||
|
self.attn_type = attn_type
|
||||||
|
self.clamp_len = clamp_len
|
||||||
|
self.sample_softmax = sample_softmax
|
||||||
|
self.adaptive = adaptive
|
||||||
|
self.dropout = dropout
|
||||||
|
self.dropatt = dropatt
|
||||||
|
self.untie_r = untie_r
|
||||||
|
self.init = init
|
||||||
|
self.init_range = init_range
|
||||||
|
self.proj_init_std = proj_init_std
|
||||||
|
self.init_std = init_std
|
||||||
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||||
@@ -106,39 +139,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
json_config = json.loads(reader.read())
|
json_config = json.loads(reader.read())
|
||||||
for key, value in json_config.items():
|
for key, value in json_config.items():
|
||||||
self.__dict__[key] = value
|
self.__dict__[key] = value
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
elif not isinstance(vocab_size_or_config_json_file, int):
|
||||||
self.n_token = vocab_size_or_config_json_file
|
|
||||||
self.cutoffs = []
|
|
||||||
self.cutoffs.extend(cutoffs)
|
|
||||||
self.tie_weight = tie_weight
|
|
||||||
if proj_share_all_but_first:
|
|
||||||
self.tie_projs = [False] + [True] * len(self.cutoffs)
|
|
||||||
else:
|
|
||||||
self.tie_projs = [False] + [False] * len(self.cutoffs)
|
|
||||||
self.d_model = d_model
|
|
||||||
self.d_embed = d_embed
|
|
||||||
self.d_head = d_head
|
|
||||||
self.d_inner = d_inner
|
|
||||||
self.div_val = div_val
|
|
||||||
self.pre_lnorm = pre_lnorm
|
|
||||||
self.n_layer = n_layer
|
|
||||||
self.n_head = n_head
|
|
||||||
self.tgt_len = tgt_len
|
|
||||||
self.ext_len = ext_len
|
|
||||||
self.mem_len = mem_len
|
|
||||||
self.same_length = same_length
|
|
||||||
self.attn_type = attn_type
|
|
||||||
self.clamp_len = clamp_len
|
|
||||||
self.sample_softmax = sample_softmax
|
|
||||||
self.adaptive = adaptive
|
|
||||||
self.dropout = dropout
|
|
||||||
self.dropatt = dropatt
|
|
||||||
self.untie_r = untie_r
|
|
||||||
self.init = init
|
|
||||||
self.init_range = init_range
|
|
||||||
self.proj_init_std = proj_init_std
|
|
||||||
self.init_std = init_std
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||||
" or the path to a pretrained model config file (str)")
|
" or the path to a pretrained model config file (str)")
|
||||||
|
|
||||||
@@ -54,11 +54,12 @@ class PretrainedConfig(object):
|
|||||||
self.output_attentions = kwargs.pop('output_attentions', False)
|
self.output_attentions = kwargs.pop('output_attentions', False)
|
||||||
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
||||||
self.torchscript = kwargs.pop('torchscript', False)
|
self.torchscript = kwargs.pop('torchscript', False)
|
||||||
|
self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
|
||||||
self.pruned_heads = kwargs.pop('pruned_heads', {})
|
self.pruned_heads = kwargs.pop('pruned_heads', {})
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
def save_pretrained(self, save_directory):
|
||||||
""" Save a configuration object to the directory `save_directory`, so that it
|
""" Save a configuration object to the directory `save_directory`, so that it
|
||||||
can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
|
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
|
||||||
"""
|
"""
|
||||||
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
|
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
|
||||||
|
|
||||||
@@ -66,16 +67,17 @@ class PretrainedConfig(object):
|
|||||||
output_config_file = os.path.join(save_directory, CONFIG_NAME)
|
output_config_file = os.path.join(save_directory, CONFIG_NAME)
|
||||||
|
|
||||||
self.to_json_file(output_config_file)
|
self.to_json_file(output_config_file)
|
||||||
|
logger.info("Configuration saved in {}".format(output_config_file))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||||
r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
|
r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
@@ -174,7 +176,7 @@ class PretrainedConfig(object):
|
|||||||
"""Constructs a `Config` from a Python dictionary of parameters."""
|
"""Constructs a `Config` from a Python dictionary of parameters."""
|
||||||
config = cls(vocab_size_or_config_json_file=-1)
|
config = cls(vocab_size_or_config_json_file=-1)
|
||||||
for key, value in json_object.items():
|
for key, value in json_object.items():
|
||||||
config.__dict__[key] = value
|
setattr(config, key, value)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -56,8 +56,6 @@ class XLMConfig(PretrainedConfig):
|
|||||||
|
|
||||||
dropout: The dropout probabilitiy for all fully connected
|
dropout: The dropout probabilitiy for all fully connected
|
||||||
layers in the embeddings, encoder, and pooler.
|
layers in the embeddings, encoder, and pooler.
|
||||||
dropatt: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
max_position_embeddings: The maximum sequence length that this model might
|
max_position_embeddings: The maximum sequence length that this model might
|
||||||
ever be used with. Typically set this to something large just in case
|
ever be used with. Typically set this to something large just in case
|
||||||
(e.g., 512 or 1024 or 2048).
|
(e.g., 512 or 1024 or 2048).
|
||||||
@@ -66,7 +64,6 @@ class XLMConfig(PretrainedConfig):
|
|||||||
layer_norm_eps: The epsilon used by LayerNorm.
|
layer_norm_eps: The epsilon used by LayerNorm.
|
||||||
|
|
||||||
dropout: float, dropout rate.
|
dropout: float, dropout rate.
|
||||||
dropatt: float, dropout rate on attention probabilities.
|
|
||||||
init: str, the initialization scheme, either "normal" or "uniform".
|
init: str, the initialization scheme, either "normal" or "uniform".
|
||||||
init_range: float, initialize the parameters with a uniform distribution
|
init_range: float, initialize the parameters with a uniform distribution
|
||||||
in [-init_range, init_range]. Only effective when init="uniform".
|
in [-init_range, init_range]. Only effective when init="uniform".
|
||||||
@@ -49,14 +49,11 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
|
|
||||||
dropout: The dropout probabilitiy for all fully connected
|
dropout: The dropout probabilitiy for all fully connected
|
||||||
layers in the embeddings, encoder, and pooler.
|
layers in the embeddings, encoder, and pooler.
|
||||||
dropatt: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
layer_norm_eps: The epsilon used by LayerNorm.
|
layer_norm_eps: The epsilon used by LayerNorm.
|
||||||
|
|
||||||
dropout: float, dropout rate.
|
dropout: float, dropout rate.
|
||||||
dropatt: float, dropout rate on attention probabilities.
|
|
||||||
init: str, the initialization scheme, either "normal" or "uniform".
|
init: str, the initialization scheme, either "normal" or "uniform".
|
||||||
init_range: float, initialize the parameters with a uniform distribution
|
init_range: float, initialize the parameters with a uniform distribution
|
||||||
in [-init_range, init_range]. Only effective when init="uniform".
|
in [-init_range, init_range]. Only effective when init="uniform".
|
||||||
@@ -80,6 +77,7 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
n_layer=24,
|
n_layer=24,
|
||||||
n_head=16,
|
n_head=16,
|
||||||
d_inner=4096,
|
d_inner=4096,
|
||||||
|
max_position_embeddings=512,
|
||||||
ff_activation="gelu",
|
ff_activation="gelu",
|
||||||
untie_r=True,
|
untie_r=True,
|
||||||
attn_type="bi",
|
attn_type="bi",
|
||||||
@@ -112,7 +110,7 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||||
json_config = json.loads(reader.read())
|
json_config = json.loads(reader.read())
|
||||||
for key, value in json_config.items():
|
for key, value in json_config.items():
|
||||||
self.__dict__[key] = value
|
setattr(config, key, value)
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
elif isinstance(vocab_size_or_config_json_file, int):
|
||||||
self.n_token = vocab_size_or_config_json_file
|
self.n_token = vocab_size_or_config_json_file
|
||||||
self.d_model = d_model
|
self.d_model = d_model
|
||||||
@@ -21,7 +21,7 @@ from __future__ import print_function
|
|||||||
import argparse
|
import argparse
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
|
from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
@@ -20,7 +20,7 @@ import argparse
|
|||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from pytorch_transformers import BertModel
|
from transformers import BertModel
|
||||||
|
|
||||||
|
|
||||||
def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
|
def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
|
||||||
@@ -21,7 +21,7 @@ from io import open
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
from transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
GPT2Config,
|
GPT2Config,
|
||||||
GPT2Model,
|
GPT2Model,
|
||||||
load_tf_weights_in_gpt2)
|
load_tf_weights_in_gpt2)
|
||||||
@@ -21,7 +21,7 @@ from io import open
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
from transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
OpenAIGPTConfig,
|
OpenAIGPTConfig,
|
||||||
OpenAIGPTModel,
|
OpenAIGPTModel,
|
||||||
load_tf_weights_in_openai_gpt)
|
load_tf_weights_in_openai_gpt)
|
||||||
234
transformers/convert_pytorch_checkpoint_to_tf2.py
Normal file
234
transformers/convert_pytorch_checkpoint_to_tf2.py
Normal file
@@ -0,0 +1,234 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Convert pytorch checkpoints to TensorFlow """
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from transformers import is_torch_available, cached_path
|
||||||
|
|
||||||
|
from transformers import (BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
else:
|
||||||
|
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,) = (
|
||||||
|
None, None, None, None,
|
||||||
|
None, None,
|
||||||
|
None, None,
|
||||||
|
None, None,
|
||||||
|
None, None,
|
||||||
|
None, None,
|
||||||
|
None, None, None,
|
||||||
|
None, None, None,)
|
||||||
|
|
||||||
|
|
||||||
|
import logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
# Conversion table: maps a model type (or the shortcut name of a specific
# fine-tuned checkpoint that needs a task-specific head) to the objects
# required to convert it. Each value is a 6-tuple, in this order:
#   (config class,
#    TF 2.0 model class,
#    PyTorch -> TF 2.0 weight loading function,
#    PyTorch model class,
#    PyTorch pretrained model archive map,
#    pretrained config archive map)
# The PyTorch model class and model archive map are None when PyTorch is not
# installed (see the import fallback above).
MODEL_CLASSES = {
    'bert': (
        BertConfig,
        TFBertForPreTraining,
        load_bert_pt_weights_in_tf2,
        BertForPreTraining,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'bert-large-uncased-whole-word-masking-finetuned-squad': (
        BertConfig,
        TFBertForQuestionAnswering,
        load_bert_pt_weights_in_tf2,
        BertForQuestionAnswering,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'bert-large-cased-whole-word-masking-finetuned-squad': (
        BertConfig,
        TFBertForQuestionAnswering,
        load_bert_pt_weights_in_tf2,
        BertForQuestionAnswering,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'bert-base-cased-finetuned-mrpc': (
        BertConfig,
        TFBertForSequenceClassification,
        load_bert_pt_weights_in_tf2,
        BertForSequenceClassification,
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'gpt2': (
        GPT2Config,
        TFGPT2LMHeadModel,
        load_gpt2_pt_weights_in_tf2,
        GPT2LMHeadModel,
        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'xlnet': (
        XLNetConfig,
        TFXLNetLMHeadModel,
        load_xlnet_pt_weights_in_tf2,
        XLNetLMHeadModel,
        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'xlm': (
        XLMConfig,
        TFXLMWithLMHeadModel,
        load_xlm_pt_weights_in_tf2,
        XLMWithLMHeadModel,
        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'transfo-xl': (
        TransfoXLConfig,
        TFTransfoXLLMHeadModel,
        load_transfo_xl_pt_weights_in_tf2,
        TransfoXLLMHeadModel,
        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'openai-gpt': (
        OpenAIGPTConfig,
        TFOpenAIGPTLMHeadModel,
        load_openai_gpt_pt_weights_in_tf2,
        OpenAIGPTLMHeadModel,
        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'roberta': (
        RobertaConfig,
        TFRobertaForMaskedLM,
        load_roberta_pt_weights_in_tf2,
        RobertaForMaskedLM,
        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'roberta-large-mnli': (
        RobertaConfig,
        TFRobertaForSequenceClassification,
        load_roberta_pt_weights_in_tf2,
        RobertaForSequenceClassification,
        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'distilbert': (
        DistilBertConfig,
        TFDistilBertForMaskedLM,
        load_distilbert_pt_weights_in_tf2,
        DistilBertForMaskedLM,
        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
    'distilbert-base-uncased-distilled-squad': (
        DistilBertConfig,
        TFDistilBertForQuestionAnswering,
        load_distilbert_pt_weights_in_tf2,
        DistilBertForQuestionAnswering,
        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
    ),
}
|
||||||
|
|
||||||
|
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
    """Convert a single PyTorch checkpoint to TensorFlow 2.0 weights saved in HDF5 format.

    Args:
        model_type: key of ``MODEL_CLASSES`` selecting the config class, the
            TF 2.0 / PyTorch model classes and the weight loading function.
        pytorch_checkpoint_path: path to a PyTorch checkpoint, or a key of the
            model archive map for ``model_type`` (resolved through ``cached_path``).
        config_file: path to a JSON configuration file, or a key of the config
            archive map for ``model_type`` (resolved through ``cached_path``).
        tf_dump_path: destination passed to ``tf_model.save_weights``.
        compare_with_pt_model: if True, run the same dummy batch through the
            PyTorch and TensorFlow models and check that the first outputs
            differ by at most 2e-2 in absolute value.
        use_cached_models: if False, force re-download of archive files.

    Raises:
        ValueError: if ``model_type`` is not a key of ``MODEL_CLASSES``.
        ImportError: if PyTorch is not available (the PyTorch entries of
            ``MODEL_CLASSES`` are then ``None``).
        AssertionError: if ``compare_with_pt_model`` is set and the outputs of
            the two models differ by more than 2e-2.
    """
    if model_type not in MODEL_CLASSES:
        raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))

    config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]

    # Without PyTorch the PyTorch-side entries are None placeholders; fail early
    # with a clear message instead of an opaque TypeError on the `in` test below.
    if pt_model_class is None or aws_model_maps is None:
        raise ImportError("PyTorch must be installed to convert PyTorch checkpoints to TensorFlow 2.0.")

    # Initialise TF model
    if config_file in aws_config_map:
        config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models)
    config = config_class.from_json_file(config_file)
    config.output_hidden_states = True
    config.output_attentions = True
    print("Building TensorFlow model from configuration: {}".format(str(config)))
    tf_model = model_class(config)

    # Load weights from the PyTorch checkpoint into the TF model
    if pytorch_checkpoint_path in aws_model_maps:
        pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models)
    tf_model = loading_fct(tf_model, pytorch_checkpoint_path)

    if compare_with_pt_model:
        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
        tf_inputs = tf.constant(inputs_list)
        tfo = tf_model(tf_inputs, training=False)  # build the network

        pt_model = pt_model_class.from_pretrained(None,
                                                  config=config,
                                                  state_dict=torch.load(pytorch_checkpoint_path,
                                                                        map_location='cpu'))
        pt_inputs = torch.tensor(inputs_list)
        with torch.no_grad():
            pto = pt_model(pt_inputs)

        np_pt = pto[0].detach().numpy()
        np_tf = tfo[0].numpy()
        diff = np.amax(np.abs(np_pt - np_tf))
        print("Max absolute difference between models outputs {}".format(diff))
        # Explicit raise rather than `assert`, so the check still runs under
        # `python -O`; `not diff <= ...` also rejects NaN like the assert did.
        if not diff <= 2e-2:
            raise AssertionError("Error, model absolute difference is >2e-2")

    # Save the TensorFlow weights
    print("Save TensorFlow model to {}".format(tf_dump_path))
    tf_model.save_weights(tf_dump_path, save_format='h5')
|
||||||
|
|
||||||
|
|
||||||
|
def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
    """Convert one or several PyTorch checkpoints to TensorFlow ``.h5`` weight files.

    Args:
        args_model_type: Key of ``MODEL_CLASSES`` to convert, or ``None`` to loop over every key.
        tf_dump_path: Existing directory. Each converted checkpoint is written to
            ``<tf_dump_path>/<model_shortcut_name>-tf_model.h5``.
        model_shortcut_names_or_path: List of checkpoint shortcut names or paths. ``None`` means
            every shortcut name registered for the model type currently being converted.
        config_shortcut_names_or_path: List of configuration shortcut names or paths, paired
            position by position with the checkpoints. ``None`` means reuse the checkpoint names.
        compare_with_pt_model: Forwarded to ``convert_pt_checkpoint_to_tf``.
        use_cached_models: When ``False`` every file is fetched with ``force_download=True``.
        only_convert_finetuned_models: When ``True`` convert only checkpoints whose name contains
            ``-squad``, ``-mrpc`` or ``-mnli``; when ``False`` skip exactly those checkpoints.

    Raises:
        AssertionError: If ``tf_dump_path`` is not an existing directory.
        ValueError: If a model type is not a key of ``MODEL_CLASSES``.
    """
    # Validate the parameter itself, not the script-level ``args`` global, so the
    # function also works when imported and called from other code.
    assert os.path.isdir(tf_dump_path), "--tf_dump_path should be a directory"

    if args_model_type is None:
        model_types = list(MODEL_CLASSES.keys())
    else:
        model_types = [args_model_type]

    for j, model_type in enumerate(model_types, start=1):
        print("=" * 100)
        print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type))
        print("=" * 100)
        if model_type not in MODEL_CLASSES:
            raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())))

        config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]

        # Resolve the defaults into locals: overwriting the parameters would make every
        # later model type reuse the shortcut names of the first one.
        if model_shortcut_names_or_path is None:
            model_names = list(aws_model_maps.keys())
        else:
            model_names = model_shortcut_names_or_path
        if config_shortcut_names_or_path is None:
            config_names = model_names
        else:
            config_names = config_shortcut_names_or_path

        checkpoint_pairs = list(zip(model_names, config_names))
        for i, (model_shortcut_name, config_shortcut_name) in enumerate(checkpoint_pairs, start=1):
            print("-" * 100)
            is_finetuned = any(tag in model_shortcut_name for tag in ('-squad', '-mrpc', '-mnli'))
            if is_finetuned and not only_convert_finetuned_models:
                print(" Skipping finetuned checkpoint {}".format(model_shortcut_name))
                continue
            if not is_finetuned and only_convert_finetuned_models:
                print(" Skipping not finetuned checkpoint {}".format(model_shortcut_name))
                continue
            # Finetuned checkpoints are converted under their own shortcut name as model type.
            checkpoint_model_type = model_shortcut_name if is_finetuned else model_type
            print(" Converting checkpoint {}/{}: {} - model_type {}".format(
                i, len(checkpoint_pairs), model_shortcut_name, checkpoint_model_type))
            print("-" * 100)

            if config_shortcut_name in aws_config_map:
                config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models)
            else:
                config_file = cached_path(config_shortcut_name, force_download=not use_cached_models)

            if model_shortcut_name in aws_model_maps:
                model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models)
            else:
                model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)

            convert_pt_checkpoint_to_tf(checkpoint_model_type,
                                        model_file,
                                        config_file,
                                        os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                        compare_with_pt_model=compare_with_pt_model)
            # NOTE(review): this deletes whatever cached_path returned. If cached_path hands
            # back a user-supplied local path unchanged, the caller's own file is removed --
            # confirm against cached_path before relying on this with local files.
            os.remove(config_file)
            os.remove(model_file)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to
    # convert_all_pt_checkpoints_to_tf.
    parser = argparse.ArgumentParser()
    ## Required parameters
    # NOTE: convert_all_pt_checkpoints_to_tf asserts that this path is an existing directory.
    parser.add_argument("--tf_dump_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output Tensorflow dump file.")
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        help="Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys())))
    parser.add_argument("--pytorch_checkpoint_path",
                        default=None,
                        type=str,
                        help="Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
                             "If not given, will download and convert all the checkpoints from AWS.")
    # The third help fragment ends with a space so the concatenated literals
    # do not run together as "nameuse".
    parser.add_argument("--config_file",
                        default=None,
                        type=str,
                        help="The config json file corresponding to the pre-trained model. \n"
                             "This specifies the model architecture. If not given and "
                             "--pytorch_checkpoint_path is not given or is a shortcut name "
                             "use the configuration associated to the shortcut name on the AWS")
    parser.add_argument("--compare_with_pt_model",
                        action='store_true',
                        help="Compare Tensorflow and PyTorch model predictions.")
    parser.add_argument("--use_cached_models",
                        action='store_true',
                        help="Use cached models if possible instead of updating to latest checkpoint versions.")
    parser.add_argument("--only_convert_finetuned_models",
                        action='store_true',
                        help="Only convert finetuned models.")
    args = parser.parse_args()

    # A single checkpoint / config given on the command line goes through the same
    # entry point as the "convert everything" case: it is wrapped in a one-element list.
    convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None,
                                     args.tf_dump_path,
                                     model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None,
                                     config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
                                     compare_with_pt_model=args.compare_with_pt_model,
                                     use_cached_models=args.use_cached_models,
                                     only_convert_finetuned_models=args.only_convert_finetuned_models)
|
||||||
@@ -23,12 +23,12 @@ import torch
|
|||||||
|
|
||||||
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
||||||
from fairseq.modules import TransformerSentenceEncoderLayer
|
from fairseq.modules import TransformerSentenceEncoderLayer
|
||||||
from pytorch_transformers import (BertConfig, BertEncoder,
|
from transformers import (BertConfig, BertEncoder,
|
||||||
BertIntermediate, BertLayer,
|
BertIntermediate, BertLayer,
|
||||||
BertModel, BertOutput,
|
BertModel, BertOutput,
|
||||||
BertSelfAttention,
|
BertSelfAttention,
|
||||||
BertSelfOutput)
|
BertSelfOutput)
|
||||||
from pytorch_transformers import (RobertaEmbeddings,
|
from transformers import (RobertaEmbeddings,
|
||||||
RobertaForMaskedLM,
|
RobertaForMaskedLM,
|
||||||
RobertaForSequenceClassification,
|
RobertaForSequenceClassification,
|
||||||
RobertaModel)
|
RobertaModel)
|
||||||
@@ -23,12 +23,12 @@ from io import open
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
import pytorch_transformers.tokenization_transfo_xl as data_utils
|
import transformers.tokenization_transfo_xl as data_utils
|
||||||
|
|
||||||
from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
|
from transformers import CONFIG_NAME, WEIGHTS_NAME
|
||||||
from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
|
from transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
|
||||||
load_tf_weights_in_transfo_xl)
|
load_tf_weights_in_transfo_xl)
|
||||||
from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
|
from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
|
||||||
|
|
||||||
if sys.version_info[0] == 2:
|
if sys.version_info[0] == 2:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
@@ -23,8 +23,8 @@ from io import open
|
|||||||
import torch
|
import torch
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
|
from transformers import CONFIG_NAME, WEIGHTS_NAME
|
||||||
from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
|
from transformers.tokenization_xlm import VOCAB_FILES_NAMES
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
@@ -33,7 +33,15 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
|
|||||||
# Load checkpoint
|
# Load checkpoint
|
||||||
chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
|
chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
|
||||||
|
|
||||||
model = chkpt['model']
|
state_dict = chkpt['model']
|
||||||
|
|
||||||
|
# We have the base model one level deeper than the original XLM repository
|
||||||
|
two_levels_state_dict = {}
|
||||||
|
for k, v in state_dict.items():
|
||||||
|
if 'pred_layer' in k:
|
||||||
|
two_levels_state_dict[k] = v
|
||||||
|
else:
|
||||||
|
two_levels_state_dict['transformer.' + k] = v
|
||||||
|
|
||||||
config = chkpt['params']
|
config = chkpt['params']
|
||||||
config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
|
config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
|
||||||
@@ -47,7 +55,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
|
|||||||
pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file']
|
pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file']
|
||||||
|
|
||||||
print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
|
print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
|
||||||
torch.save(model, pytorch_weights_dump_path)
|
torch.save(two_levels_state_dict, pytorch_weights_dump_path)
|
||||||
|
|
||||||
print("Save configuration file to {}".format(pytorch_config_dump_path))
|
print("Save configuration file to {}".format(pytorch_config_dump_path))
|
||||||
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
|
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
|
||||||
@@ -22,7 +22,7 @@ import os
|
|||||||
import argparse
|
import argparse
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
from transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
XLNetConfig,
|
XLNetConfig,
|
||||||
XLNetLMHeadModel, XLNetForQuestionAnswering,
|
XLNetLMHeadModel, XLNetForQuestionAnswering,
|
||||||
XLNetForSequenceClassification,
|
XLNetForSequenceClassification,
|
||||||
6
transformers/data/__init__.py
Normal file
6
transformers/data/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from .processors import InputExample, InputFeatures, DataProcessor
|
||||||
|
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||||
|
|
||||||
|
from .metrics import is_sklearn_available
|
||||||
|
if is_sklearn_available():
|
||||||
|
from .metrics import glue_compute_metrics
|
||||||
83
transformers/data/metrics/__init__.py
Normal file
83
transformers/data/metrics/__init__.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Probe the optional scipy / scikit-learn dependencies once, at import time.
# _has_sklearn records whether both imports succeeded.
try:
    from scipy.stats import pearsonr, spearmanr
    from sklearn.metrics import matthews_corrcoef, f1_score
except (AttributeError, ImportError) as e:
    logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
    _has_sklearn = False
else:
    _has_sklearn = True
|
||||||
|
|
||||||
|
def is_sklearn_available():
    """Return the module-level ``_has_sklearn`` flag set by the import probe."""
    sklearn_flag = _has_sklearn
    return sklearn_flag
|
||||||
|
|
||||||
|
# The metric helpers are only defined when the scipy / scikit-learn imports succeeded.
if _has_sklearn:

    def simple_accuracy(preds, labels):
        """Return the mean of the comparison ``preds == labels``.

        NOTE(review): relies on ``==`` being elementwise and on the result exposing
        ``.mean()`` -- presumably both inputs are numpy arrays; confirm with callers.
        """
        hits = preds == labels
        return hits.mean()

    def acc_and_f1(preds, labels):
        """Return accuracy, F1 and their arithmetic mean under ``acc``, ``f1``, ``acc_and_f1``."""
        accuracy = simple_accuracy(preds, labels)
        f1_value = f1_score(y_true=labels, y_pred=preds)
        scores = {"acc": accuracy, "f1": f1_value}
        scores["acc_and_f1"] = (accuracy + f1_value) / 2
        return scores

    def pearson_and_spearman(preds, labels):
        """Return Pearson and Spearman correlations and their mean under ``pearson``, ``spearmanr``, ``corr``."""
        # Both scipy functions return a pair; only the first element is kept.
        pearson_value = pearsonr(preds, labels)[0]
        spearman_value = spearmanr(preds, labels)[0]
        scores = {"pearson": pearson_value, "spearmanr": spearman_value}
        scores["corr"] = (pearson_value + spearman_value) / 2
        return scores

    def glue_compute_metrics(task_name, preds, labels):
        """Compute the metric dict for one GLUE task.

        Args:
            task_name: One of ``cola``, ``sst-2``, ``mrpc``, ``sts-b``, ``qqp``, ``mnli``,
                ``mnli-mm``, ``qnli``, ``rte``, ``wnli``.
            preds: Predictions, same length as ``labels``.
            labels: Reference labels.

        Returns:
            A dict of metric name to value.

        Raises:
            AssertionError: If ``preds`` and ``labels`` differ in length.
            KeyError: If ``task_name`` is not one of the tasks listed above.
        """
        assert len(preds) == len(labels)
        if task_name == "cola":
            return {"mcc": matthews_corrcoef(labels, preds)}
        if task_name == "sts-b":
            return pearson_and_spearman(preds, labels)
        if task_name in ("mrpc", "qqp"):
            return acc_and_f1(preds, labels)
        if task_name in ("sst-2", "mnli", "mnli-mm", "qnli", "rte", "wnli"):
            return {"acc": simple_accuracy(preds, labels)}
        raise KeyError(task_name)
|
||||||
3
transformers/data/processors/__init__.py
Normal file
3
transformers/data/processors/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from .utils import InputExample, InputFeatures, DataProcessor
|
||||||
|
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||||
|
|
||||||
@@ -13,84 +13,154 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" BERT classification fine-tuning: utilities to work with GLUE tasks """
|
""" GLUE processors and helpers """
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
|
||||||
|
|
||||||
import csv
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
from io import open
|
|
||||||
|
|
||||||
from scipy.stats import pearsonr, spearmanr
|
from .utils import DataProcessor, InputExample, InputFeatures
|
||||||
from sklearn.metrics import matthews_corrcoef, f1_score
|
from ...file_utils import is_tf_available
|
||||||
|
|
||||||
|
if is_tf_available():
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class InputExample(object):
|
def glue_convert_examples_to_features(examples, tokenizer,
|
||||||
"""A single training/test example for simple sequence classification."""
|
max_length=512,
|
||||||
|
task=None,
|
||||||
|
label_list=None,
|
||||||
|
output_mode=None,
|
||||||
|
pad_on_left=False,
|
||||||
|
pad_token=0,
|
||||||
|
pad_token_segment_id=0,
|
||||||
|
mask_padding_with_zero=True):
|
||||||
|
"""
|
||||||
|
Loads a data file into a list of ``InputFeatures``
|
||||||
|
|
||||||
def __init__(self, guid, text_a, text_b=None, label=None):
|
Args:
|
||||||
"""Constructs a InputExample.
|
examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
|
||||||
|
tokenizer: Instance of a tokenizer that will tokenize the examples
|
||||||
|
max_length: Maximum example length
|
||||||
|
task: GLUE task
|
||||||
|
label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
|
||||||
|
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
|
||||||
|
pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
|
||||||
|
pad_token: Padding token
|
||||||
|
pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
|
||||||
|
mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
|
||||||
|
and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
|
||||||
|
actual values)
|
||||||
|
|
||||||
Args:
|
Returns:
|
||||||
guid: Unique id for the example.
|
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
|
||||||
text_a: string. The untokenized text of the first sequence. For single
|
containing the task-specific features. If the input is a list of ``InputExamples``, will return
|
||||||
sequence tasks, only this sequence must be specified.
|
a list of task-specific ``InputFeatures`` which can be fed to the model.
|
||||||
text_b: (Optional) string. The untokenized text of the second sequence.
|
|
||||||
Only must be specified for sequence pair tasks.
|
|
||||||
label: (Optional) string. The label of the example. This should be
|
|
||||||
specified for train and dev examples, but not for test examples.
|
|
||||||
"""
|
|
||||||
self.guid = guid
|
|
||||||
self.text_a = text_a
|
|
||||||
self.text_b = text_b
|
|
||||||
self.label = label
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
is_tf_dataset = False
|
||||||
|
if is_tf_available() and isinstance(examples, tf.data.Dataset):
|
||||||
|
is_tf_dataset = True
|
||||||
|
|
||||||
class InputFeatures(object):
|
if task is not None:
|
||||||
"""A single set of features of data."""
|
processor = glue_processors[task]()
|
||||||
|
if label_list is None:
|
||||||
|
label_list = processor.get_labels()
|
||||||
|
logger.info("Using label list %s for task %s" % (label_list, task))
|
||||||
|
if output_mode is None:
|
||||||
|
output_mode = glue_output_modes[task]
|
||||||
|
logger.info("Using output mode %s for task %s" % (output_mode, task))
|
||||||
|
|
||||||
def __init__(self, input_ids, input_mask, segment_ids, label_id):
|
label_map = {label: i for i, label in enumerate(label_list)}
|
||||||
self.input_ids = input_ids
|
|
||||||
self.input_mask = input_mask
|
|
||||||
self.segment_ids = segment_ids
|
|
||||||
self.label_id = label_id
|
|
||||||
|
|
||||||
|
features = []
|
||||||
|
for (ex_index, example) in enumerate(examples):
|
||||||
|
if ex_index % 10000 == 0:
|
||||||
|
logger.info("Writing example %d" % (ex_index))
|
||||||
|
if is_tf_dataset:
|
||||||
|
example = processor.get_example_from_tensor_dict(example)
|
||||||
|
|
||||||
class DataProcessor(object):
|
inputs = tokenizer.encode_plus(
|
||||||
"""Base class for data converters for sequence classification data sets."""
|
example.text_a,
|
||||||
|
example.text_b,
|
||||||
|
add_special_tokens=True,
|
||||||
|
max_length=max_length,
|
||||||
|
truncate_first_sequence=True # We're truncating the first sequence in priority
|
||||||
|
)
|
||||||
|
input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
||||||
"""Gets a collection of `InputExample`s for the train set."""
|
# tokens are attended to.
|
||||||
raise NotImplementedError()
|
attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
|
||||||
|
|
||||||
def get_dev_examples(self, data_dir):
|
# Zero-pad up to the sequence length.
|
||||||
"""Gets a collection of `InputExample`s for the dev set."""
|
padding_length = max_length - len(input_ids)
|
||||||
raise NotImplementedError()
|
if pad_on_left:
|
||||||
|
input_ids = ([pad_token] * padding_length) + input_ids
|
||||||
|
attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
|
||||||
|
token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
|
||||||
|
else:
|
||||||
|
input_ids = input_ids + ([pad_token] * padding_length)
|
||||||
|
attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
|
||||||
|
token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
|
||||||
|
|
||||||
def get_labels(self):
|
assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
|
||||||
"""Gets the list of labels for this data set."""
|
assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
|
||||||
raise NotImplementedError()
|
assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
|
||||||
|
|
||||||
@classmethod
|
if output_mode == "classification":
|
||||||
def _read_tsv(cls, input_file, quotechar=None):
|
label = label_map[example.label]
|
||||||
"""Reads a tab separated value file."""
|
elif output_mode == "regression":
|
||||||
with open(input_file, "r", encoding="utf-8-sig") as f:
|
label = float(example.label)
|
||||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
else:
|
||||||
lines = []
|
raise KeyError(output_mode)
|
||||||
for line in reader:
|
|
||||||
if sys.version_info[0] == 2:
|
if ex_index < 5:
|
||||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
logger.info("*** Example ***")
|
||||||
lines.append(line)
|
logger.info("guid: %s" % (example.guid))
|
||||||
return lines
|
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
||||||
|
logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
|
||||||
|
logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
|
||||||
|
logger.info("label: %s (id = %d)" % (example.label, label))
|
||||||
|
|
||||||
|
features.append(
|
||||||
|
InputFeatures(input_ids=input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
label=label))
|
||||||
|
|
||||||
|
if is_tf_available() and is_tf_dataset:
|
||||||
|
def gen():
|
||||||
|
for ex in features:
|
||||||
|
yield ({'input_ids': ex.input_ids,
|
||||||
|
'attention_mask': ex.attention_mask,
|
||||||
|
'token_type_ids': ex.token_type_ids},
|
||||||
|
ex.label)
|
||||||
|
|
||||||
|
return tf.data.Dataset.from_generator(gen,
|
||||||
|
({'input_ids': tf.int32,
|
||||||
|
'attention_mask': tf.int32,
|
||||||
|
'token_type_ids': tf.int32},
|
||||||
|
tf.int64),
|
||||||
|
({'input_ids': tf.TensorShape([None]),
|
||||||
|
'attention_mask': tf.TensorShape([None]),
|
||||||
|
'token_type_ids': tf.TensorShape([None])},
|
||||||
|
tf.TensorShape([])))
|
||||||
|
|
||||||
|
return features
|
||||||
|
|
||||||
|
|
||||||
class MrpcProcessor(DataProcessor):
|
class MrpcProcessor(DataProcessor):
|
||||||
"""Processor for the MRPC data set (GLUE version)."""
|
"""Processor for the MRPC data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries.

    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    first_text = tensor_dict['sentence1'].numpy().decode('utf-8')
    second_text = tensor_dict['sentence2'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, first_text, second_text, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
|
logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
|
||||||
@@ -124,6 +194,13 @@ class MrpcProcessor(DataProcessor):
|
|||||||
class MnliProcessor(DataProcessor):
|
class MnliProcessor(DataProcessor):
|
||||||
"""Processor for the MultiNLI data set (GLUE version)."""
|
"""Processor for the MultiNLI data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``premise``, ``hypothesis`` and ``label`` entries.

    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    premise_text = tensor_dict['premise'].numpy().decode('utf-8')
    hypothesis_text = tensor_dict['hypothesis'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, premise_text, hypothesis_text, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -167,6 +244,13 @@ class MnliMismatchedProcessor(MnliProcessor):
|
|||||||
class ColaProcessor(DataProcessor):
|
class ColaProcessor(DataProcessor):
|
||||||
"""Processor for the CoLA data set (GLUE version)."""
|
"""Processor for the CoLA data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build a single-sentence ``InputExample`` from the ``idx``, ``sentence`` and ``label`` entries.

    The second text slot is always ``None``.
    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    sentence_text = tensor_dict['sentence'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, sentence_text, None, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -196,6 +280,13 @@ class ColaProcessor(DataProcessor):
|
|||||||
class Sst2Processor(DataProcessor):
|
class Sst2Processor(DataProcessor):
|
||||||
"""Processor for the SST-2 data set (GLUE version)."""
|
"""Processor for the SST-2 data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build a single-sentence ``InputExample`` from the ``idx``, ``sentence`` and ``label`` entries.

    The second text slot is always ``None``.
    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    sentence_text = tensor_dict['sentence'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, sentence_text, None, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -227,6 +318,13 @@ class Sst2Processor(DataProcessor):
|
|||||||
class StsbProcessor(DataProcessor):
|
class StsbProcessor(DataProcessor):
|
||||||
"""Processor for the STS-B data set (GLUE version)."""
|
"""Processor for the STS-B data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries.

    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    first_text = tensor_dict['sentence1'].numpy().decode('utf-8')
    second_text = tensor_dict['sentence2'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, first_text, second_text, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -259,6 +357,13 @@ class StsbProcessor(DataProcessor):
|
|||||||
class QqpProcessor(DataProcessor):
|
class QqpProcessor(DataProcessor):
|
||||||
"""Processor for the QQP data set (GLUE version)."""
|
"""Processor for the QQP data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``question1``, ``question2`` and ``label`` entries.

    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    first_question = tensor_dict['question1'].numpy().decode('utf-8')
    second_question = tensor_dict['question2'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, first_question, second_question, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -294,6 +399,13 @@ class QqpProcessor(DataProcessor):
|
|||||||
class QnliProcessor(DataProcessor):
|
class QnliProcessor(DataProcessor):
|
||||||
"""Processor for the QNLI data set (GLUE version)."""
|
"""Processor for the QNLI data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``question``, ``sentence`` and ``label`` entries.

    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    question_text = tensor_dict['question'].numpy().decode('utf-8')
    sentence_text = tensor_dict['sentence'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, question_text, sentence_text, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -302,7 +414,7 @@ class QnliProcessor(DataProcessor):
|
|||||||
def get_dev_examples(self, data_dir):
|
def get_dev_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
|
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
|
||||||
"dev_matched")
|
"dev_matched")
|
||||||
|
|
||||||
def get_labels(self):
|
def get_labels(self):
|
||||||
@@ -327,6 +439,13 @@ class QnliProcessor(DataProcessor):
|
|||||||
class RteProcessor(DataProcessor):
|
class RteProcessor(DataProcessor):
|
||||||
"""Processor for the RTE data set (GLUE version)."""
|
"""Processor for the RTE data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries.

    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    first_text = tensor_dict['sentence1'].numpy().decode('utf-8')
    second_text = tensor_dict['sentence2'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, first_text, second_text, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -359,6 +478,13 @@ class RteProcessor(DataProcessor):
|
|||||||
class WnliProcessor(DataProcessor):
|
class WnliProcessor(DataProcessor):
|
||||||
"""Processor for the WNLI data set (GLUE version)."""
|
"""Processor for the WNLI data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries.

    NOTE(review): every entry must expose ``.numpy()``; presumably these are eager
    TensorFlow tensors -- confirm with the caller.
    """
    guid = tensor_dict['idx'].numpy()
    first_text = tensor_dict['sentence1'].numpy().decode('utf-8')
    second_text = tensor_dict['sentence2'].numpy().decode('utf-8')
    label_text = str(tensor_dict['label'].numpy())
    return InputExample(guid, first_text, second_text, label_text)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -387,198 +513,19 @@ class WnliProcessor(DataProcessor):
|
|||||||
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
||||||
return examples
|
return examples
|
||||||
|
|
||||||
|
glue_tasks_num_labels = {
|
||||||
|
"cola": 2,
|
||||||
|
"mnli": 3,
|
||||||
|
"mrpc": 2,
|
||||||
|
"sst-2": 2,
|
||||||
|
"sts-b": 1,
|
||||||
|
"qqp": 2,
|
||||||
|
"qnli": 2,
|
||||||
|
"rte": 2,
|
||||||
|
"wnli": 2,
|
||||||
|
}
|
||||||
|
|
||||||
def convert_examples_to_features(examples, label_list, max_seq_length,
|
glue_processors = {
|
||||||
tokenizer, output_mode,
|
|
||||||
cls_token_at_end=False,
|
|
||||||
cls_token='[CLS]',
|
|
||||||
cls_token_segment_id=1,
|
|
||||||
sep_token='[SEP]',
|
|
||||||
sep_token_extra=False,
|
|
||||||
pad_on_left=False,
|
|
||||||
pad_token=0,
|
|
||||||
pad_token_segment_id=0,
|
|
||||||
sequence_a_segment_id=0,
|
|
||||||
sequence_b_segment_id=1,
|
|
||||||
mask_padding_with_zero=True):
|
|
||||||
""" Loads a data file into a list of `InputBatch`s
|
|
||||||
`cls_token_at_end` define the location of the CLS token:
|
|
||||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
|
||||||
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
|
|
||||||
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
|
|
||||||
"""
|
|
||||||
|
|
||||||
label_map = {label : i for i, label in enumerate(label_list)}
|
|
||||||
|
|
||||||
features = []
|
|
||||||
for (ex_index, example) in enumerate(examples):
|
|
||||||
if ex_index % 10000 == 0:
|
|
||||||
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
|
|
||||||
|
|
||||||
tokens_a = tokenizer.tokenize(example.text_a)
|
|
||||||
|
|
||||||
tokens_b = None
|
|
||||||
if example.text_b:
|
|
||||||
tokens_b = tokenizer.tokenize(example.text_b)
|
|
||||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
|
||||||
# length is less than the specified length.
|
|
||||||
# Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
|
|
||||||
special_tokens_count = 4 if sep_token_extra else 3
|
|
||||||
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
|
|
||||||
else:
|
|
||||||
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
|
|
||||||
special_tokens_count = 3 if sep_token_extra else 2
|
|
||||||
if len(tokens_a) > max_seq_length - special_tokens_count:
|
|
||||||
tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
|
|
||||||
|
|
||||||
# The convention in BERT is:
|
|
||||||
# (a) For sequence pairs:
|
|
||||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
|
||||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
|
||||||
# (b) For single sequences:
|
|
||||||
# tokens: [CLS] the dog is hairy . [SEP]
|
|
||||||
# type_ids: 0 0 0 0 0 0 0
|
|
||||||
#
|
|
||||||
# Where "type_ids" are used to indicate whether this is the first
|
|
||||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
|
||||||
# `type=1` were learned during pre-training and are added to the wordpiece
|
|
||||||
# embedding vector (and position vector). This is not *strictly* necessary
|
|
||||||
# since the [SEP] token unambiguously separates the sequences, but it makes
|
|
||||||
# it easier for the model to learn the concept of sequences.
|
|
||||||
#
|
|
||||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
|
||||||
# used as as the "sentence vector". Note that this only makes sense because
|
|
||||||
# the entire model is fine-tuned.
|
|
||||||
tokens = tokens_a + [sep_token]
|
|
||||||
if sep_token_extra:
|
|
||||||
# roberta uses an extra separator b/w pairs of sentences
|
|
||||||
tokens += [sep_token]
|
|
||||||
segment_ids = [sequence_a_segment_id] * len(tokens)
|
|
||||||
|
|
||||||
if tokens_b:
|
|
||||||
tokens += tokens_b + [sep_token]
|
|
||||||
segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
|
|
||||||
|
|
||||||
if cls_token_at_end:
|
|
||||||
tokens = tokens + [cls_token]
|
|
||||||
segment_ids = segment_ids + [cls_token_segment_id]
|
|
||||||
else:
|
|
||||||
tokens = [cls_token] + tokens
|
|
||||||
segment_ids = [cls_token_segment_id] + segment_ids
|
|
||||||
|
|
||||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
|
||||||
|
|
||||||
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
|
||||||
# tokens are attended to.
|
|
||||||
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
|
|
||||||
|
|
||||||
# Zero-pad up to the sequence length.
|
|
||||||
padding_length = max_seq_length - len(input_ids)
|
|
||||||
if pad_on_left:
|
|
||||||
input_ids = ([pad_token] * padding_length) + input_ids
|
|
||||||
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
|
|
||||||
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
|
|
||||||
else:
|
|
||||||
input_ids = input_ids + ([pad_token] * padding_length)
|
|
||||||
input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
|
|
||||||
segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
|
|
||||||
|
|
||||||
assert len(input_ids) == max_seq_length
|
|
||||||
assert len(input_mask) == max_seq_length
|
|
||||||
assert len(segment_ids) == max_seq_length
|
|
||||||
|
|
||||||
if output_mode == "classification":
|
|
||||||
label_id = label_map[example.label]
|
|
||||||
elif output_mode == "regression":
|
|
||||||
label_id = float(example.label)
|
|
||||||
else:
|
|
||||||
raise KeyError(output_mode)
|
|
||||||
|
|
||||||
if ex_index < 5:
|
|
||||||
logger.info("*** Example ***")
|
|
||||||
logger.info("guid: %s" % (example.guid))
|
|
||||||
logger.info("tokens: %s" % " ".join(
|
|
||||||
[str(x) for x in tokens]))
|
|
||||||
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
|
||||||
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
|
||||||
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
|
|
||||||
logger.info("label: %s (id = %d)" % (example.label, label_id))
|
|
||||||
|
|
||||||
features.append(
|
|
||||||
InputFeatures(input_ids=input_ids,
|
|
||||||
input_mask=input_mask,
|
|
||||||
segment_ids=segment_ids,
|
|
||||||
label_id=label_id))
|
|
||||||
return features
|
|
||||||
|
|
||||||
|
|
||||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
|
||||||
"""Truncates a sequence pair in place to the maximum length."""
|
|
||||||
|
|
||||||
# This is a simple heuristic which will always truncate the longer sequence
|
|
||||||
# one token at a time. This makes more sense than truncating an equal percent
|
|
||||||
# of tokens from each, since if one sequence is very short then each token
|
|
||||||
# that's truncated likely contains more information than a longer sequence.
|
|
||||||
while True:
|
|
||||||
total_length = len(tokens_a) + len(tokens_b)
|
|
||||||
if total_length <= max_length:
|
|
||||||
break
|
|
||||||
if len(tokens_a) > len(tokens_b):
|
|
||||||
tokens_a.pop()
|
|
||||||
else:
|
|
||||||
tokens_b.pop()
|
|
||||||
|
|
||||||
|
|
||||||
def simple_accuracy(preds, labels):
|
|
||||||
return (preds == labels).mean()
|
|
||||||
|
|
||||||
|
|
||||||
def acc_and_f1(preds, labels):
|
|
||||||
acc = simple_accuracy(preds, labels)
|
|
||||||
f1 = f1_score(y_true=labels, y_pred=preds)
|
|
||||||
return {
|
|
||||||
"acc": acc,
|
|
||||||
"f1": f1,
|
|
||||||
"acc_and_f1": (acc + f1) / 2,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def pearson_and_spearman(preds, labels):
|
|
||||||
pearson_corr = pearsonr(preds, labels)[0]
|
|
||||||
spearman_corr = spearmanr(preds, labels)[0]
|
|
||||||
return {
|
|
||||||
"pearson": pearson_corr,
|
|
||||||
"spearmanr": spearman_corr,
|
|
||||||
"corr": (pearson_corr + spearman_corr) / 2,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def compute_metrics(task_name, preds, labels):
|
|
||||||
assert len(preds) == len(labels)
|
|
||||||
if task_name == "cola":
|
|
||||||
return {"mcc": matthews_corrcoef(labels, preds)}
|
|
||||||
elif task_name == "sst-2":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "mrpc":
|
|
||||||
return acc_and_f1(preds, labels)
|
|
||||||
elif task_name == "sts-b":
|
|
||||||
return pearson_and_spearman(preds, labels)
|
|
||||||
elif task_name == "qqp":
|
|
||||||
return acc_and_f1(preds, labels)
|
|
||||||
elif task_name == "mnli":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "mnli-mm":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "qnli":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "rte":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "wnli":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
else:
|
|
||||||
raise KeyError(task_name)
|
|
||||||
|
|
||||||
processors = {
|
|
||||||
"cola": ColaProcessor,
|
"cola": ColaProcessor,
|
||||||
"mnli": MnliProcessor,
|
"mnli": MnliProcessor,
|
||||||
"mnli-mm": MnliMismatchedProcessor,
|
"mnli-mm": MnliMismatchedProcessor,
|
||||||
@@ -591,7 +538,7 @@ processors = {
|
|||||||
"wnli": WnliProcessor,
|
"wnli": WnliProcessor,
|
||||||
}
|
}
|
||||||
|
|
||||||
output_modes = {
|
glue_output_modes = {
|
||||||
"cola": "classification",
|
"cola": "classification",
|
||||||
"mnli": "classification",
|
"mnli": "classification",
|
||||||
"mnli-mm": "classification",
|
"mnli-mm": "classification",
|
||||||
@@ -603,15 +550,3 @@ output_modes = {
|
|||||||
"rte": "classification",
|
"rte": "classification",
|
||||||
"wnli": "classification",
|
"wnli": "classification",
|
||||||
}
|
}
|
||||||
|
|
||||||
GLUE_TASKS_NUM_LABELS = {
|
|
||||||
"cola": 2,
|
|
||||||
"mnli": 3,
|
|
||||||
"mrpc": 2,
|
|
||||||
"sst-2": 2,
|
|
||||||
"sts-b": 1,
|
|
||||||
"qqp": 2,
|
|
||||||
"qnli": 2,
|
|
||||||
"rte": 2,
|
|
||||||
"wnli": 2,
|
|
||||||
}
|
|
||||||
120
transformers/data/processors/utils.py
Normal file
120
transformers/data/processors/utils.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
|
||||||
|
class InputExample(object):
    """
    One raw (untokenized) example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. Untokenized text of the first sequence; the only text
            that has to be given for single sequence tasks.
        text_b: (Optional) string. Untokenized text of the second sequence,
            to be given only for sequence pair tasks.
        label: (Optional) string. Label of the example. Expected for train and
            dev examples, but not for test examples.
    """
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

    def __repr__(self):
        # The JSON dump is already a string, so it is used directly as the repr.
        return self.to_json_string()

    def to_dict(self):
        """Return a deep copy of the instance attributes as a Python dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending with a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
class InputFeatures(object):
    """
    The model-ready features computed for a single example.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        label: Label corresponding to the input
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, label):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        self.token_type_ids, self.label = token_type_ids, label

    def __repr__(self):
        # The JSON dump is already a string, so it is used directly as the repr.
        return self.to_json_string()

    def to_dict(self):
        """Return a deep copy of the instance attributes as a Python dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending with a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
class DataProcessor(object):
    """Base class for converters of sequence classification data sets.

    Every public method raises ``NotImplementedError`` here; only the
    ``_read_tsv`` helper carries an implementation.
    """

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an example from a dict with tensorflow tensors.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        """
        raise NotImplementedError()

    def get_train_examples(self, data_dir):
        """Return a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Return the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a tab separated value file and return its rows as lists of cells."""
        # NOTE(review): the Python 2 builtin open() takes no `encoding` argument;
        # confirm whether the Python 2 branch below is still reachable.
        with open(input_file, "r", encoding="utf-8-sig") as tsv_file:
            rows = list(csv.reader(tsv_file, delimiter="\t", quotechar=quotechar))
        if sys.version_info[0] == 2:
            # Python 2 only: convert every cell to unicode.
            rows = [[unicode(cell, 'utf-8') for cell in row] for row in rows]  # noqa: F821
        return rows
|
||||||
@@ -23,6 +23,24 @@ from botocore.exceptions import ClientError
|
|||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
try:
|
||||||
|
import tensorflow as tf
|
||||||
|
assert int(tf.__version__[0]) >= 2
|
||||||
|
_tf_available = True # pylint: disable=invalid-name
|
||||||
|
logger.info("TensorFlow version {} available.".format(tf.__version__))
|
||||||
|
except (ImportError, AssertionError):
|
||||||
|
_tf_available = False # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
_torch_available = True # pylint: disable=invalid-name
|
||||||
|
logger.info("PyTorch version {} available.".format(torch.__version__))
|
||||||
|
except ImportError:
|
||||||
|
_torch_available = False # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from torch.hub import _get_torch_home
|
from torch.hub import _get_torch_home
|
||||||
torch_cache_home = _get_torch_home()
|
torch_cache_home = _get_torch_home()
|
||||||
@@ -30,7 +48,7 @@ except ImportError:
|
|||||||
torch_cache_home = os.path.expanduser(
|
torch_cache_home = os.path.expanduser(
|
||||||
os.getenv('TORCH_HOME', os.path.join(
|
os.getenv('TORCH_HOME', os.path.join(
|
||||||
os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
|
os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
|
||||||
default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
|
default_cache_path = os.path.join(torch_cache_home, 'transformers')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@@ -47,12 +65,18 @@ except (AttributeError, ImportError):
|
|||||||
default_cache_path))
|
default_cache_path))
|
||||||
|
|
||||||
PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
||||||
|
TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
||||||
|
|
||||||
WEIGHTS_NAME = "pytorch_model.bin"
|
WEIGHTS_NAME = "pytorch_model.bin"
|
||||||
|
TF2_WEIGHTS_NAME = 'tf_model.h5'
|
||||||
TF_WEIGHTS_NAME = 'model.ckpt'
|
TF_WEIGHTS_NAME = 'model.ckpt'
|
||||||
CONFIG_NAME = "config.json"
|
CONFIG_NAME = "config.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
def is_torch_available():
    """Return the module-level `_torch_available` flag."""
    return _torch_available
|
||||||
|
|
||||||
|
def is_tf_available():
    """Return the module-level `_tf_available` flag."""
    return _tf_available
|
||||||
|
|
||||||
if not six.PY2:
|
if not six.PY2:
|
||||||
def add_start_docstrings(*docstr):
|
def add_start_docstrings(*docstr):
|
||||||
@@ -83,6 +107,9 @@ def url_to_filename(url, etag=None):
|
|||||||
Convert `url` into a hashed filename in a repeatable way.
|
Convert `url` into a hashed filename in a repeatable way.
|
||||||
If `etag` is specified, append its hash to the url's, delimited
|
If `etag` is specified, append its hash to the url's, delimited
|
||||||
by a period.
|
by a period.
|
||||||
|
If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name
|
||||||
|
so that TF 2.0 can identify it as a HDF5 file
|
||||||
|
(see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
|
||||||
"""
|
"""
|
||||||
url_bytes = url.encode('utf-8')
|
url_bytes = url.encode('utf-8')
|
||||||
url_hash = sha256(url_bytes)
|
url_hash = sha256(url_bytes)
|
||||||
@@ -93,6 +120,9 @@ def url_to_filename(url, etag=None):
|
|||||||
etag_hash = sha256(etag_bytes)
|
etag_hash = sha256(etag_bytes)
|
||||||
filename += '.' + etag_hash.hexdigest()
|
filename += '.' + etag_hash.hexdigest()
|
||||||
|
|
||||||
|
if url.endswith('.h5'):
|
||||||
|
filename += '.h5'
|
||||||
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
@@ -102,7 +132,7 @@ def filename_to_url(filename, cache_dir=None):
|
|||||||
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
|
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
|
||||||
"""
|
"""
|
||||||
if cache_dir is None:
|
if cache_dir is None:
|
||||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
cache_dir = TRANSFORMERS_CACHE
|
||||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
cache_dir = str(cache_dir)
|
cache_dir = str(cache_dir)
|
||||||
|
|
||||||
@@ -133,7 +163,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
|||||||
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
||||||
"""
|
"""
|
||||||
if cache_dir is None:
|
if cache_dir is None:
|
||||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
cache_dir = TRANSFORMERS_CACHE
|
||||||
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
|
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
|
||||||
url_or_filename = str(url_or_filename)
|
url_or_filename = str(url_or_filename)
|
||||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
@@ -222,7 +252,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
|
|||||||
If it's not there, download it. Then return the path to the cached file.
|
If it's not there, download it. Then return the path to the cached file.
|
||||||
"""
|
"""
|
||||||
if cache_dir is None:
|
if cache_dir is None:
|
||||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
cache_dir = TRANSFORMERS_CACHE
|
||||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
cache_dir = str(cache_dir)
|
cache_dir = str(cache_dir)
|
||||||
if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
|
if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
|
||||||
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class AutoModel(object):
|
class AutoModel(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.AutoModel` is a generic model class
|
:class:`~transformers.AutoModel` is a generic model class
|
||||||
that will be instantiated as one of the base model classes of the library
|
that will be instantiated as one of the base model classes of the library
|
||||||
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -84,23 +84,23 @@ class AutoModel(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
model_args: (`optional`) Sequence of positional arguments:
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
||||||
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
state_dict: (`optional`) dict:
|
state_dict: (`optional`) dict:
|
||||||
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
@@ -120,7 +120,7 @@ class AutoModel(object):
|
|||||||
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -157,7 +157,7 @@ class AutoModel(object):
|
|||||||
|
|
||||||
class AutoModelWithLMHead(object):
|
class AutoModelWithLMHead(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
|
:class:`~transformers.AutoModelWithLMHead` is a generic model class
|
||||||
that will be instantiated as one of the language modeling model classes of the library
|
that will be instantiated as one of the language modeling model classes of the library
|
||||||
when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -208,23 +208,23 @@ class AutoModelWithLMHead(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
model_args: (`optional`) Sequence of positional arguments:
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
||||||
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
state_dict: (`optional`) dict:
|
state_dict: (`optional`) dict:
|
||||||
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
@@ -244,7 +244,7 @@ class AutoModelWithLMHead(object):
|
|||||||
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -281,7 +281,7 @@ class AutoModelWithLMHead(object):
|
|||||||
|
|
||||||
class AutoModelForSequenceClassification(object):
|
class AutoModelForSequenceClassification(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
|
:class:`~transformers.AutoModelForSequenceClassification` is a generic model class
|
||||||
that will be instantiated as one of the sequence classification model classes of the library
|
that will be instantiated as one of the sequence classification model classes of the library
|
||||||
when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -326,23 +326,23 @@ class AutoModelForSequenceClassification(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
model_args: (`optional`) Sequence of positional arguments:
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
All remaning positional arguments will be passed to the underlying model's ``__init__`` method
|
All remaning positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
|
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
|
||||||
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
state_dict: (`optional`) dict:
|
state_dict: (`optional`) dict:
|
||||||
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
|
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
@@ -362,7 +362,7 @@ class AutoModelForSequenceClassification(object):
|
|||||||
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -392,7 +392,7 @@ class AutoModelForSequenceClassification(object):
|
|||||||
|
|
||||||
class AutoModelForQuestionAnswering(object):
|
class AutoModelForQuestionAnswering(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
|
:class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
|
||||||
that will be instantiated as one of the question answering model classes of the library
|
that will be instantiated as one of the question answering model classes of the library
|
||||||
when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -435,23 +435,23 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
model_args: (`optional`) Sequence of positional arguments:
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
All remaning positional arguments will be passed to the underlying model's ``__init__`` method
|
All remaning positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
|
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
|
||||||
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
state_dict: (`optional`) dict:
|
state_dict: (`optional`) dict:
|
||||||
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
|
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
@@ -471,7 +471,7 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -118,26 +118,27 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
|||||||
|
|
||||||
|
|
||||||
def gelu(x):
|
def gelu(x):
|
||||||
"""Implementation of the gelu activation function.
|
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
||||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||||
Also see https://arxiv.org/abs/1606.08415
|
Also see https://arxiv.org/abs/1606.08415
|
||||||
"""
|
"""
|
||||||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||||
|
|
||||||
|
def gelu_new(x):
|
||||||
|
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
|
||||||
|
Also see https://arxiv.org/abs/1606.08415
|
||||||
|
"""
|
||||||
|
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||||
|
|
||||||
def swish(x):
|
def swish(x):
|
||||||
return x * torch.sigmoid(x)
|
return x * torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
|
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
|
||||||
|
|
||||||
|
|
||||||
try:
|
BertLayerNorm = torch.nn.LayerNorm
|
||||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
|
||||||
except (ImportError, AttributeError) as e:
|
|
||||||
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
|
||||||
BertLayerNorm = torch.nn.LayerNorm
|
|
||||||
|
|
||||||
class BertEmbeddings(nn.Module):
|
class BertEmbeddings(nn.Module):
|
||||||
"""Construct the embeddings from word, position and token_type embeddings.
|
"""Construct the embeddings from word, position and token_type embeddings.
|
||||||
@@ -195,7 +196,7 @@ class BertSelfAttention(nn.Module):
|
|||||||
x = x.view(*new_x_shape)
|
x = x.view(*new_x_shape)
|
||||||
return x.permute(0, 2, 1, 3)
|
return x.permute(0, 2, 1, 3)
|
||||||
|
|
||||||
def forward(self, hidden_states, attention_mask, head_mask=None):
|
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||||
mixed_query_layer = self.query(hidden_states)
|
mixed_query_layer = self.query(hidden_states)
|
||||||
mixed_key_layer = self.key(hidden_states)
|
mixed_key_layer = self.key(hidden_states)
|
||||||
mixed_value_layer = self.value(hidden_states)
|
mixed_value_layer = self.value(hidden_states)
|
||||||
@@ -207,8 +208,9 @@ class BertSelfAttention(nn.Module):
|
|||||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
||||||
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
||||||
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
if attention_mask is not None:
|
||||||
attention_scores = attention_scores + attention_mask
|
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
||||||
|
attention_scores = attention_scores + attention_mask
|
||||||
|
|
||||||
# Normalize the attention scores to probabilities.
|
# Normalize the attention scores to probabilities.
|
||||||
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
||||||
@@ -275,7 +277,7 @@ class BertAttention(nn.Module):
|
|||||||
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
||||||
self.pruned_heads = self.pruned_heads.union(heads)
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def forward(self, input_tensor, attention_mask, head_mask=None):
|
def forward(self, input_tensor, attention_mask=None, head_mask=None):
|
||||||
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
||||||
attention_output = self.output(self_outputs[0], input_tensor)
|
attention_output = self.output(self_outputs[0], input_tensor)
|
||||||
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
||||||
@@ -318,7 +320,7 @@ class BertLayer(nn.Module):
|
|||||||
self.intermediate = BertIntermediate(config)
|
self.intermediate = BertIntermediate(config)
|
||||||
self.output = BertOutput(config)
|
self.output = BertOutput(config)
|
||||||
|
|
||||||
def forward(self, hidden_states, attention_mask, head_mask=None):
|
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||||
attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
|
attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
|
||||||
attention_output = attention_outputs[0]
|
attention_output = attention_outputs[0]
|
||||||
intermediate_output = self.intermediate(attention_output)
|
intermediate_output = self.intermediate(attention_output)
|
||||||
@@ -334,7 +336,7 @@ class BertEncoder(nn.Module):
|
|||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
||||||
|
|
||||||
def forward(self, hidden_states, attention_mask, head_mask=None):
|
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||||
all_hidden_states = ()
|
all_hidden_states = ()
|
||||||
all_attentions = ()
|
all_attentions = ()
|
||||||
for i, layer_module in enumerate(self.layer):
|
for i, layer_module in enumerate(self.layer):
|
||||||
@@ -480,9 +482,9 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
|
|||||||
https://pytorch.org/docs/stable/nn.html#module
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
BERT_INPUTS_DOCSTRING = r"""
|
BERT_INPUTS_DOCSTRING = r"""
|
||||||
@@ -506,9 +508,9 @@ BERT_INPUTS_DOCSTRING = r"""
|
|||||||
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
Indices can be obtained using :class:`transformers.BertTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -372,9 +372,9 @@ DISTILBERT_START_DOCSTRING = r"""
|
|||||||
https://medium.com/huggingface/distilbert-8cf3380435b5
|
https://medium.com/huggingface/distilbert-8cf3380435b5
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
DISTILBERT_INPUTS_DOCSTRING = r"""
|
DISTILBERT_INPUTS_DOCSTRING = r"""
|
||||||
@@ -649,7 +649,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
|||||||
start_positions = torch.tensor([1])
|
start_positions = torch.tensor([1])
|
||||||
end_positions = torch.tensor([3])
|
end_positions = torch.tensor([3])
|
||||||
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||||
loss, start_scores, end_scores = outputs[:2]
|
loss, start_scores, end_scores = outputs[:3]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
|
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
|
||||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
|
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
|
||||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
|
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
|
||||||
|
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
|
||||||
|
|
||||||
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
||||||
""" Load tf checkpoints in a pytorch model
|
""" Load tf checkpoints in a pytorch model
|
||||||
@@ -280,9 +281,9 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in
|
|||||||
https://pytorch.org/docs/stable/nn.html#module
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
||||||
@@ -290,9 +291,9 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Indices of input sequence tokens in the vocabulary.
|
Indices of input sequence tokens in the vocabulary.
|
||||||
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
|
Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
@@ -367,6 +368,13 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
input_ids = input_ids.view(-1, input_shape[-1])
|
||||||
|
if token_type_ids is not None:
|
||||||
|
token_type_ids = token_type_ids.view(-1, input_shape[-1])
|
||||||
|
if position_ids is not None:
|
||||||
|
position_ids = position_ids.view(-1, input_shape[-1])
|
||||||
|
|
||||||
if past is None:
|
if past is None:
|
||||||
past_length = 0
|
past_length = 0
|
||||||
past = [None] * len(self.h)
|
past = [None] * len(self.h)
|
||||||
@@ -378,6 +386,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
|
|
||||||
# Attention mask.
|
# Attention mask.
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
|
attention_mask = attention_mask.view(-1, input_shape[-1])
|
||||||
# We create a 3D attention mask from a 2D tensor mask.
|
# We create a 3D attention mask from a 2D tensor mask.
|
||||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||||
@@ -407,14 +416,9 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
head_mask = [None] * self.config.n_layer
|
head_mask = [None] * self.config.n_layer
|
||||||
|
|
||||||
input_shape = input_ids.size()
|
|
||||||
input_ids = input_ids.view(-1, input_ids.size(-1))
|
|
||||||
position_ids = position_ids.view(-1, position_ids.size(-1))
|
|
||||||
|
|
||||||
inputs_embeds = self.wte(input_ids)
|
inputs_embeds = self.wte(input_ids)
|
||||||
position_embeds = self.wpe(position_ids)
|
position_embeds = self.wpe(position_ids)
|
||||||
if token_type_ids is not None:
|
if token_type_ids is not None:
|
||||||
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
|
|
||||||
token_type_embeds = self.wte(token_type_ids)
|
token_type_embeds = self.wte(token_type_ids)
|
||||||
else:
|
else:
|
||||||
token_type_embeds = 0
|
token_type_embeds = 0
|
||||||
@@ -490,7 +494,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
|
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||||
|
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||||
@@ -586,7 +590,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
||||||
|
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||||
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
||||||
@@ -294,9 +294,9 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in
|
|||||||
https://pytorch.org/docs/stable/nn.html#module
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
||||||
@@ -304,9 +304,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Indices of input sequence tokens in the vocabulary.
|
Indices of input sequence tokens in the vocabulary.
|
||||||
GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
|
Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -43,6 +43,9 @@ class RobertaEmbeddings(BertEmbeddings):
|
|||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaEmbeddings, self).__init__(config)
|
super(RobertaEmbeddings, self).__init__(config)
|
||||||
self.padding_idx = 1
|
self.padding_idx = 1
|
||||||
|
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
|
||||||
|
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
|
||||||
|
padding_idx=self.padding_idx)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
||||||
seq_length = input_ids.size(1)
|
seq_length = input_ids.size(1)
|
||||||
@@ -77,9 +80,9 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
|||||||
https://pytorch.org/docs/stable/nn.html#module
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the
|
config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
|
||||||
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
|
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ROBERTA_INPUTS_DOCSTRING = r"""
|
ROBERTA_INPUTS_DOCSTRING = r"""
|
||||||
@@ -102,8 +105,8 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
|||||||
RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -361,9 +364,9 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
|||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0``
|
``token_type_ids: 0 0 0 0 0 0 0``
|
||||||
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
Indices can be obtained using :class:`transformers.BertTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
||||||
Segment token indices to indicate first and second portions of the inputs.
|
Segment token indices to indicate first and second portions of the inputs.
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
||||||
501
transformers/modeling_tf_auto.py
Normal file
501
transformers/modeling_tf_auto.py
Normal file
@@ -0,0 +1,501 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Auto Model class. """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
|
||||||
|
from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
|
||||||
|
from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
|
||||||
|
from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
|
||||||
|
from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple
|
||||||
|
from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
|
||||||
|
from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
|
||||||
|
from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
|
||||||
|
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TFAutoModel(object):
    r"""
        :class:`~transformers.TFAutoModel` is a generic model class
        that will be instantiated as one of the base model classes of the library
        when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The base model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFBertModel (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetModel (XLNet model)
            - contains `xlm`: TFXLMModel (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # Direct construction is deliberately forbidden: the concrete class is
        # only known once a model name/path is supplied to `from_pretrained`.
        raise EnvironmentError("TFAutoModel is designed to be instantiated "
            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the base model classes of the library
        from a pre-trained model configuration.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFBertModel (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetModel (XLNet model)
            - contains `xlm`: TFXLMModel (XLM model)

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Returns:
            Whatever the selected model class's own ``from_pretrained`` returns for the
            forwarded arguments.

        Raises:
            ValueError: if `pretrained_model_name_or_path` contains none of the known patterns.

        Examples::

            model = TFAutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
            model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
            config = AutoConfig.from_json_file('./pt_model/bert_pt_model_config.json')
            model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
        # Order matters: 'distilbert' and 'roberta' both contain the substring
        # 'bert', so they must be tested before the plain 'bert' pattern.
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # No pattern matched: list every identifier the dispatcher checks above.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta', 'distilbert'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class TFAutoModelWithLMHead(object):
    r"""
        :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
        that will be instantiated as one of the language modeling model classes of the library
        when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # Direct construction is deliberately forbidden: the concrete class is
        # only known once a model name/path is supplied to `from_pretrained`.
        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Returns:
            Whatever the selected model class's own ``from_pretrained`` returns for the
            forwarded arguments.

        Raises:
            ValueError: if `pretrained_model_name_or_path` contains none of the known patterns.

        Examples::

            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
            config = AutoConfig.from_json_file('./pt_model/bert_pt_model_config.json')
            model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
        # Order matters: 'distilbert' and 'roberta' both contain the substring
        # 'bert', so they must be tested before the plain 'bert' pattern.
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # No pattern matched: list every identifier the dispatcher checks above.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta', 'distilbert'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class TFAutoModelForSequenceClassification(object):
|
||||||
|
r"""
|
||||||
|
:class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
|
||||||
|
that will be instantiated as one of the sequence classification model classes of the library
|
||||||
|
when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
|
||||||
|
class method.
|
||||||
|
|
||||||
|
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||||
|
using pattern matching on the `pretrained_model_name_or_path` string.
|
||||||
|
|
||||||
|
The model class to instantiate is selected as the first pattern matching
|
||||||
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
|
||||||
|
- contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
|
||||||
|
- contains `bert`: TFBertForSequenceClassification (Bert model)
|
||||||
|
- contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
|
||||||
|
- contains `xlm`: TFXLMForSequenceClassification (XLM model)
|
||||||
|
|
||||||
|
This class cannot be instantiated using `__init__()` (throws an error).
|
||||||
|
"""
|
||||||
|
def __init__(self):
    # Direct construction is deliberately forbidden; callers must go through
    # `from_pretrained`. The message names this class (it previously named
    # TFAutoModelWithLMHead, a copy-paste slip that pointed users at the wrong API).
    raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated "
        "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` method.")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||||
|
r""" Instantiates one of the sequence classification model classes of the library
|
||||||
|
from a pre-trained model configuration.
|
||||||
|
|
||||||
|
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||||
|
using pattern matching on the `pretrained_model_name_or_path` string.
|
||||||
|
|
||||||
|
The model class to instantiate is selected as the first pattern matching
|
||||||
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
|
||||||
|
- contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
|
||||||
|
- contains `bert`: TFBertForSequenceClassification (Bert model)
|
||||||
|
- contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
|
||||||
|
- contains `xlm`: TFXLMForSequenceClassification (XLM model)
|
||||||
|
|
||||||
|
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||||
|
To train the model, you should first set it back in training mode with `model.train()`
|
||||||
|
|
||||||
|
Params:
|
||||||
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
|
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||||
|
|
||||||
|
from_pt: (`Optional`) Boolean
|
||||||
|
Set to True if the Checkpoint is a PyTorch checkpoint.
|
||||||
|
|
||||||
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
|
All remaning positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
|
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
|
||||||
|
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
|
state_dict: (`optional`) dict:
|
||||||
|
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
|
cache_dir: (`optional`) string:
|
||||||
|
Path to a directory in which a downloaded pre-trained model
|
||||||
|
configuration should be cached if the standard cache should not be used.
|
||||||
|
|
||||||
|
force_download: (`optional`) boolean, default False:
|
||||||
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
proxies: (`optional`) dict, default None:
|
||||||
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
|
The proxies are used on each request.
|
||||||
|
|
||||||
|
output_loading_info: (`optional`) boolean:
|
||||||
|
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
|
||||||
|
|
||||||
|
kwargs: (`optional`) Remaining dictionary of keyword arguments:
|
||||||
|
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
|
model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
|
assert model.config.output_attention == True
|
||||||
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
|
model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
|
||||||
|
|
||||||
|
"""
|
||||||
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
|
return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
|
return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
|
return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'xlnet' in pretrained_model_name_or_path:
|
||||||
|
return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'xlm' in pretrained_model_name_or_path:
|
||||||
|
return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
|
||||||
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
|
"'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class TFAutoModelForQuestionAnswering(object):
    r"""
        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: TFBertForQuestionAnswering (Bert model)
            - contains `xlnet`: TFXLNetForQuestionAnsweringSimple (XLNet model)
            - contains `xlm`: TFXLMForQuestionAnsweringSimple (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # The message must name this class, not TFAutoModelWithLMHead.
        raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated "
            "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: TFBertForQuestionAnswering (Bert model)
            - contains `xlnet`: TFXLNetForQuestionAnsweringSimple (XLNet model)
            - contains `xlm`: TFXLMForQuestionAnsweringSimple (XLM model)

        TF 2.0 Keras models have no `model.eval()` / `model.train()` switch. Dropout is only
        active when the returned model is called with ``training=True``.

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attentions=True``). Behave differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if none of the known patterns is found in ``pretrained_model_name_or_path``.

        Examples::

            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
            assert model.config.output_attentions == True
            # Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
        # 'distilbert' must be tested before 'bert', which it contains.
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
|
||||||
1044
transformers/modeling_tf_bert.py
Normal file
1044
transformers/modeling_tf_bert.py
Normal file
File diff suppressed because it is too large
Load Diff
743
transformers/modeling_tf_distilbert.py
Normal file
743
transformers/modeling_tf_distilbert.py
Normal file
@@ -0,0 +1,743 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 DistilBERT model
|
||||||
|
"""
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import copy
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .configuration_distilbert import DistilBertConfig
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Maps each DistilBERT shortcut name to the URL of its `tf_model.h5` weights file.
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'distilbert-base-uncased':
        "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
    'distilbert-base-uncased-distilled-squad':
        "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
}
|
||||||
|
|
||||||
|
|
||||||
|
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
|
||||||
|
def gelu(x):
    """ Gaussian Error Linear Unit, computed with the error function.

        Returns ``x * 0.5 * (1 + erf(x / sqrt(2)))``, i.e. ``x`` weighted by the
        standard normal CDF evaluated at ``x``.
        The tanh-based variant lives in :func:`gelu_new` and gives slightly different results.
        Also see https://arxiv.org/abs/1606.08415
    """
    scaled = x / tf.math.sqrt(2.0)
    gaussian_cdf = 0.5 * (1.0 + tf.math.erf(scaled))
    return x * gaussian_cdf
|
||||||
|
|
||||||
|
def gelu_new(x):
    """Gaussian Error Linear Unit, tanh approximation.

    Computes ``x * 0.5 * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    Original paper: https://arxiv.org/abs/1606.08415

    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cubic_term = x + 0.044715 * tf.pow(x, 3)
    tanh_arg = np.sqrt(2 / np.pi) * cubic_term
    approx_cdf = 0.5 * (1.0 + tf.tanh(tanh_arg))
    return x * approx_cdf
|
||||||
|
|
||||||
|
def load_distilbert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """Load a PyTorch DistilBERT checkpoint into a TF 2.0 model.

    Args:
        tf_model: the TF 2.0 model to load the weights into.
        pytorch_checkpoint_path: path of the PyTorch checkpoint file.
    Returns:
        Whatever `load_pytorch_checkpoint_in_tf2_model` returns.
    """
    # Small dummy batch of (input_ids, attention_mask).
    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
    tf_inputs = [inputs_list, attns_list]

    # Build the network: the forward pass is run only for its side effect,
    # so its output is intentionally discarded.
    tf_model(tf_inputs, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
|
||||||
|
|
||||||
|
class TFEmbeddings(tf.keras.layers.Layer):
    """Word plus position embeddings, followed by LayerNorm and dropout.

    The word-embedding matrix is also used, transposed, as an output projection
    when the layer is called with ``mode="linear"``.
    """

    def __init__(self, config, **kwargs):
        super(TFEmbeddings, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dim = config.dim
        self.initializer_range = config.initializer_range
        # NOTE(review): `build()` re-binds `self.word_embeddings` to a raw weight,
        # so this layer object is replaced once the layer is built.
        self.word_embeddings = TFSharedEmbeddings(
            config.vocab_size,
            config.dim,
            initializer_range=config.initializer_range,
            name='word_embeddings',
        )  # padding_idx=0)
        self.position_embeddings = tf.keras.layers.Embedding(
            config.max_position_embeddings,
            config.dim,
            embeddings_initializer=get_initializer(config.initializer_range),
            name='position_embeddings',
        )
        # Sinusoidal position embeddings are not supported by this implementation.
        if config.sinusoidal_pos_embds:
            raise NotImplementedError

        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.dropout)

    def build(self, input_shape):
        """Create the word embedding matrix of shape [vocab_size, dim]."""
        with tf.name_scope("word_embeddings"):
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.vocab_size, self.dim],
                initializer=get_initializer(self.initializer_range),
            )
        super(TFEmbeddings, self).build(input_shape)

    def call(self, inputs, mode="embedding", training=False):
        """Embed token ids or project hidden states back onto the vocabulary.

        Args:
            inputs: for ``mode="embedding"``, either an ``input_ids`` tensor or an
                ``(input_ids, position_ids)`` pair; for ``mode="linear"``, a float
                tensor of hidden states.
            mode: string, a valid value is one of "embedding" and "linear".
            training: forwarded to the dropout layer in "embedding" mode.
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, training=training)
        if mode == "linear":
            return self._linear(inputs)
        raise ValueError("mode {} is not valid.".format(mode))

    def _embedding(self, inputs, training=False):
        """Sum word and position embeddings, then apply LayerNorm and dropout.

        Parameters
        ----------
        inputs: tf.Tensor(bs, max_seq_length), or a pair (input_ids, position_ids)
            The token ids to embed, optionally with explicit position ids.
            When position ids are missing, positions 0..seq_length-1 are used.

        Outputs
        -------
        embeddings: tf.Tensor(bs, max_seq_length, dim)
            The embedded tokens (plus position embeddings, no token_type embeddings)
        """
        if isinstance(inputs, (tuple, list)):
            input_ids, position_ids = inputs
        else:
            input_ids, position_ids = inputs, None

        seq_length = tf.shape(input_ids)[1]
        if position_ids is None:
            # Shape (1, seq_length), so it broadcasts over the batch.
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]

        token_vectors = tf.gather(self.word_embeddings, input_ids)
        position_vectors = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)

        summed = token_vectors + position_vectors  # (bs, max_seq_length, dim)
        normalized = self.LayerNorm(summed)  # (bs, max_seq_length, dim)
        return self.dropout(normalized, training=training)  # (bs, max_seq_length, dim)

    def _linear(self, inputs):
        """Computes logits by multiplying inputs with the transposed word embeddings.

            Args:
                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        """
        n_batch = tf.shape(inputs)[0]
        n_tokens = tf.shape(inputs)[1]

        flattened = tf.reshape(inputs, [-1, self.dim])
        flat_logits = tf.matmul(flattened, self.word_embeddings, transpose_b=True)

        return tf.reshape(flat_logits, [n_batch, n_tokens, self.vocab_size])
|
||||||
|
|
||||||
|
|
||||||
|
class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention with query/key/value/output projections."""

    def __init__(self, config, **kwargs):
        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
        self.output_attentions = config.output_attentions

        # Each head works on an equal slice of the model dimension.
        assert self.dim % self.n_heads == 0

        self.q_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="q_lin")
        self.k_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="k_lin")
        self.v_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="v_lin")
        self.out_lin = tf.keras.layers.Dense(config.dim,
                                             kernel_initializer=get_initializer(config.initializer_range),
                                             name="out_lin")

        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(self, inputs, training=False):
        """
        Parameters
        ----------
        inputs: list of five items ``[query, key, value, mask, head_mask]``
            query: tf.Tensor(bs, seq_length, dim)
            key: tf.Tensor(bs, seq_length, dim)
            value: tf.Tensor(bs, seq_length, dim)
            mask: tf.Tensor(bs, seq_length)
            head_mask: multiplied into the attention weights, or None to skip
        training: forwarded to the attention dropout layer

        Outputs
        -------
        A tuple ``(context,)`` or, if `output_attentions=True`, ``(context, weights)``:

        context: tf.Tensor(bs, seq_length, dim)
            Contextualized layer.
        weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            Attention weights. Optional: only if `output_attentions=True`
        """
        query, key, value, mask, head_mask = inputs
        # The three-way unpack also requires `query` to be rank 3.
        bs, q_length, dim = shape_list(query)
        k_length = shape_list(key)[1]

        dim_per_head = self.dim // self.n_heads

        assert 2 <= len(tf.shape(mask)) <= 3
        # The mask is broadcast over heads and query positions.
        mask_reshape = [bs, 1, 1, k_length]

        def shape(x):
            """ separate heads """
            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))

        def unshape(x):
            """ group heads """
            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))

        q = shape(self.q_lin(query))           # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))             # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))           # (bs, n_heads, k_length, dim_per_head)

        q = q / math.sqrt(dim_per_head)                     # (bs, n_heads, q_length, dim_per_head)
        scores = tf.matmul(q, k, transpose_b=True)          # (bs, n_heads, q_length, k_length)
        mask = tf.reshape(mask, mask_reshape)               # (bs, 1, 1, k_length)
        # Additive masking: positions where mask == 0 get a very large negative score.
        # NOTE(review): assumes `mask` is a float tensor of 0/1 values — confirm against caller.
        scores = scores - 1e30 * (1.0 - mask)

        weights = tf.nn.softmax(scores, axis=-1)            # (bs, n_heads, qlen, klen)
        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = tf.matmul(weights, v)     # (bs, n_heads, qlen, dim_per_head)
        context = unshape(context)          # (bs, q_length, dim)
        context = self.out_lin(context)     # (bs, q_length, dim)

        if self.output_attentions:
            return (context, weights)
        else:
            return (context,)
|
||||||
|
|
||||||
|
class TFFFN(tf.keras.layers.Layer):
    """Feed-forward sub-layer: dense -> activation -> dense -> dropout."""

    def __init__(self, config, **kwargs):
        super(TFFFN, self).__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.lin1 = tf.keras.layers.Dense(
            config.hidden_dim,
            kernel_initializer=get_initializer(config.initializer_range),
            name="lin1",
        )
        self.lin2 = tf.keras.layers.Dense(
            config.dim,
            kernel_initializer=get_initializer(config.initializer_range),
            name="lin2",
        )
        assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
        if config.activation == 'gelu':
            self.activation = tf.keras.layers.Activation(gelu)
        else:
            self.activation = tf.keras.activations.relu

    def call(self, input, training=False):
        """Apply the feed-forward network to `input`; dropout follows `training`."""
        expanded = self.lin1(input)
        activated = self.activation(expanded)
        projected = self.lin2(activated)
        return self.dropout(projected, training=training)
|
||||||
|
|
||||||
|
|
||||||
|
class TFTransformerBlock(tf.keras.layers.Layer):
    """One transformer layer: self-attention and a feed-forward network,
    each followed by a residual connection and LayerNorm."""

    def __init__(self, config, **kwargs):
        super(TFTransformerBlock, self).__init__(**kwargs)

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.hidden_dim = config.hidden_dim
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.activation = config.activation
        self.output_attentions = config.output_attentions

        assert config.dim % config.n_heads == 0

        self.attention = TFMultiHeadSelfAttention(config, name="attention")
        self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")

        self.ffn = TFFFN(config, name="ffn")
        self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")

    def call(self, inputs, training=False):  # removed: src_enc=None, src_len=None
        """
        Parameters
        ----------
        inputs: list of three items ``[x, attn_mask, head_mask]``
            x: tf.Tensor(bs, seq_length, dim)
            attn_mask: tf.Tensor(bs, seq_length)
            head_mask: passed through to the attention layer

        Outputs
        -------
        A tuple ``(ffn_output,)`` or, when attentions are requested,
        ``(sa_weights, ffn_output)``:

        sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            The attention weights
        ffn_output: tf.Tensor(bs, seq_length, dim)
            The output of the transformer block contextualization.
        """
        x, attn_mask, head_mask = inputs

        # Self-Attention: query, key and value are all the block input.
        attention_outputs = self.attention([x, x, x, attn_mask, head_mask], training=training)
        if self.output_attentions:
            # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
            attended, sa_weights = attention_outputs
        else:
            # The attention layer always returns a tuple; take the context only.
            attended = attention_outputs[0]
        attended = self.sa_layer_norm(attended + x)  # (bs, seq_length, dim)

        # Feed Forward Network
        ffn_output = self.ffn(attended, training=training)  # (bs, seq_length, dim)
        ffn_output = self.output_layer_norm(ffn_output + attended)  # (bs, seq_length, dim)

        if self.output_attentions:
            return (sa_weights, ffn_output)
        return (ffn_output,)
|
||||||
|
|
||||||
|
|
||||||
|
class TFTransformer(tf.keras.layers.Layer):
    """Stack of ``config.n_layers`` ``TFTransformerBlock`` layers applied one after another."""

    def __init__(self, config, **kwargs):
        super(TFTransformer, self).__init__(**kwargs)
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        # Sub-layers are named 'layer_._<index>'.
        blocks = []
        for index in range(config.n_layers):
            blocks.append(TFTransformerBlock(config, name='layer_._{}'.format(index)))
        self.layer = blocks

    def call(self, inputs, training=False):
        """Run the input through every block of the stack.

        Parameters
        ----------
        inputs: sequence of three items, unpacked as ``(x, attn_mask, head_mask)``
            x: tf.Tensor(bs, seq_length, dim)
                Input sequence embedded.
            attn_mask: tf.Tensor(bs, seq_length)
                Attention mask on the sequence.
            head_mask: indexable with one entry per layer; entry ``i`` is handed to block ``i``.
        training: forwarded to every block.

        Outputs
        -------
        hidden_state: tf.Tensor(bs, seq_length, dim)
            Sequence of hidden states in the last (top) layer.
        all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
            The input of every block followed by the output of the last one
            (``n_layers + 1`` entries).
            Optional: only if output_hidden_states=True
        all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
            Tuple of length n_layers with the attention weights from each layer.
            Optional: only if output_attentions=True
        """
        x, attn_mask, head_mask = inputs
        keep_hidden = self.output_hidden_states
        keep_attentions = self.output_attentions

        all_hidden_states = ()
        all_attentions = ()
        hidden_state = x

        for layer_idx, block in enumerate(self.layer):
            if keep_hidden:
                # Record the state entering this block.
                all_hidden_states += (hidden_state,)

            block_outputs = block([hidden_state, attn_mask, head_mask[layer_idx]], training=training)
            # The contextualized states are always the last element.
            hidden_state = block_outputs[-1]

            if keep_attentions:
                assert len(block_outputs) == 2
                all_attentions += (block_outputs[0],)
            else:
                assert len(block_outputs) == 1

        if keep_hidden:
            # Add the output of the top layer.
            all_hidden_states += (hidden_state,)

        # last-layer hidden state, (all hidden states), (all attentions)
        outputs = (hidden_state,)
        if keep_hidden:
            outputs += (all_hidden_states,)
        if keep_attentions:
            outputs += (all_attentions,)
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class TFDistilBertMainLayer(tf.keras.layers.Layer):
    """DistilBERT core: a ``TFEmbeddings`` layer followed by the ``TFTransformer`` stack."""

    def __init__(self, config, **kwargs):
        super(TFDistilBertMainLayer, self).__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers

        self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings
        self.transformer = TFTransformer(config, name="transformer")  # Encoder

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError

    def call(self, inputs, attention_mask=None, head_mask=None, training=False):
        """Embed the input ids and run them through the transformer.

        ``inputs`` may be a single tensor of input ids, a list/tuple
        ``[input_ids, attention_mask, head_mask]`` (trailing items optional) or a
        dict keyed by ``'input_ids'``, ``'attention_mask'`` and ``'head_mask'``.
        Values found in ``inputs`` take precedence over the keyword arguments.

        Returns the transformer output tuple:
        last-layer hidden-state, (all hidden_states), (all attentions).

        Raises ``NotImplementedError`` when a head mask is supplied.
        """
        if isinstance(inputs, (tuple, list)):
            n_inputs = len(inputs)
            input_ids = inputs[0]
            if n_inputs > 1:
                attention_mask = inputs[1]
            if n_inputs > 2:
                head_mask = inputs[2]
            assert n_inputs <= 3, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 3, "Too many inputs."
        else:
            input_ids = inputs

        # Without a mask, attend to every position.
        if attention_mask is None:
            attention_mask = tf.ones(shape_list(input_ids))  # (bs, seq_length)
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)

        # Head masking is not supported yet; the transformer still expects one
        # (empty) entry per layer.
        # Intended contract once implemented:
        #   1.0 in head_mask indicates we keep the head
        #   attention_probs has shape bsz x n_heads x N x N
        #   input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        #   and is converted to [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            raise NotImplementedError
        head_mask = [None] * self.num_hidden_layers

        embedding_output = self.embeddings(input_ids)  # (bs, seq_length, dim)
        # last-layer hidden-state, (all hidden_states), (all attentions)
        return self.transformer([embedding_output, attention_mask, head_mask], training=training)
|
||||||
|
|
||||||
|
|
||||||
|
### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
|
||||||
|
class TFDistilBertPreTrainedModel(TFPreTrainedModel):
    """Abstract base shared by all TF DistilBERT models.

    It only binds the DistilBERT-specific pieces (configuration class, archive
    map of pretrained weights, PyTorch weight loader and base model prefix) to
    the generic ``TFPreTrainedModel`` interface.
    """

    # Configuration class for this model family.
    config_class = DistilBertConfig
    # Archive map of the pretrained TF checkpoints.
    pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    # Loader for PyTorch weights into the TF 2.0 model.
    load_pt_weights = load_distilbert_pt_weights_in_tf2
    # Attribute name under which task models store the main layer.
    base_model_prefix = "distilbert"
|
||||||
|
|
||||||
|
|
||||||
|
# Shared reST fragments prepended to the docstring of every TF DistilBERT model
# through `add_start_docstrings`.

# General description of the model and of the accepted input formats.
DISTILBERT_START_DOCSTRING = r"""
    DistilBERT is a small, fast, cheap and light Transformer model
    trained by distilling Bert base. It has 40% less parameters than
    `bert-base-uncased`, runs 60% faster while preserving over 95% of
    Bert's performances as measured on the GLUE language understanding benchmark.

    Here are the differences between the interface of Bert and DistilBert:

    - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
    - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.

    For more information on DistilBERT, please refer to our
    `detailed blog post`_

    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matter related to general usage and behavior.

    .. _`detailed blog post`:
        https://medium.com/huggingface/distilbert-8cf3380435b5

    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model

    Note on the model inputs:
        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :

        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids])` or `model([input_ids, attention_mask])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'attention_mask': attention_mask})`

    Parameters:
        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

# Description of the call inputs shared by all TF DistilBERT models.
DISTILBERT_INPUTS_DOCSTRING = r"""
    Inputs:
        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            The input sequences should start with `[CLS]` and end with `[SEP]` tokens.

            For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertModel

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
        # Embeddings + transformer encoder, without any task head.
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")

    def call(self, inputs, **kwargs):
        # Pure delegation: the main layer output tuple is returned untouched.
        return self.distilbert(inputs, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class TFDistilBertLMHead(tf.keras.layers.Layer):
    """Vocabulary projection that reuses the input embeddings and adds its own bias."""

    def __init__(self, config, input_embeddings, **kwargs):
        super(TFDistilBertLMHead, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size

        # Weight tying: the projection is done by the input embedding layer
        # itself; only the per-token bias created in `build` is owned here.
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        # One zero-initialized, trainable bias value per vocabulary entry.
        self.bias = self.add_weight(
            shape=(self.vocab_size,),
            initializer='zeros',
            trainable=True,
            name='bias',
        )
        super(TFDistilBertLMHead, self).build(input_shape)

    def call(self, hidden_states):
        # Project with the embedding layer in "linear" mode, then add the bias.
        projected = self.input_embeddings(hidden_states, mode="linear")
        return projected + self.bias
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.vocab_size = config.vocab_size

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")

        # LM head: dense -> gelu -> layer norm -> projection tied to the input embeddings.
        transform_init = get_initializer(config.initializer_range)
        self.vocab_transform = tf.keras.layers.Dense(
            config.dim,
            kernel_initializer=transform_init,
            name="vocab_transform",
        )
        self.act = tf.keras.layers.Activation(gelu)
        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")

    def call(self, inputs, **kwargs):
        encoder_outputs = self.distilbert(inputs, **kwargs)

        last_hidden = encoder_outputs[0]  # (bs, seq_length, dim)
        transformed = self.vocab_transform(last_hidden)  # (bs, seq_length, dim)
        activated = self.act(transformed)  # (bs, seq_length, dim)
        normalized = self.vocab_layer_norm(activated)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_projector(normalized)

        # logits, (hidden_states), (attentions)
        return (prediction_logits,) + encoder_outputs[1:]
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        # Classification head: dense+relu -> dropout -> linear classifier.
        self.pre_classifier = tf.keras.layers.Dense(config.dim,
                                                    kernel_initializer=get_initializer(config.initializer_range),
                                                    activation='relu',
                                                    name="pre_classifier")
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name="classifier")
        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)

    def call(self, inputs, **kwargs):
        distilbert_output = self.distilbert(inputs, **kwargs)

        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        # The hidden state of the first token stands in for the whole sequence.
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        # Dropout is only active when the caller passes training=True.
        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)

        outputs = (logits,) + distilbert_output[1:]
        return outputs  # logits, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
            Span-start scores (before SoftMax).
        **end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
            Span-end scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        # One output unit per span boundary (start, end).
        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='qa_outputs')
        # `call` splits the logits into exactly two parts, hence the check.
        assert config.num_labels == 2
        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)

    def call(self, inputs, **kwargs):
        distilbert_output = self.distilbert(inputs, **kwargs)

        hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)
        # Dropout is only active when the caller passes training=True.
        hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False))  # (bs, max_query_len, dim)
        logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
        # Split the last axis into start and end scores and drop that axis.
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        outputs = (start_logits, end_logits,) + distilbert_output[1:]
        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user