Merge branch 'master' of https://github.com/danai-antoniou/pytorch-transformers into add-duplicate-tokens-error
This commit is contained in:
@@ -1,33 +1,77 @@
|
|||||||
version: 2
|
version: 2
|
||||||
jobs:
|
jobs:
|
||||||
build_py3:
|
build_py3_torch_and_tf:
|
||||||
working_directory: ~/pytorch-transformers
|
working_directory: ~/transformers
|
||||||
docker:
|
docker:
|
||||||
- image: circleci/python:3.5
|
- image: circleci/python:3.5
|
||||||
resource_class: xlarge
|
resource_class: xlarge
|
||||||
parallelism: 1
|
parallelism: 1
|
||||||
steps:
|
steps:
|
||||||
- checkout
|
- checkout
|
||||||
|
- run: sudo pip install torch
|
||||||
|
- run: sudo pip install tensorflow==2.0.0-rc0
|
||||||
- run: sudo pip install --progress-bar off .
|
- run: sudo pip install --progress-bar off .
|
||||||
- run: sudo pip install pytest codecov pytest-cov
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
- run: sudo pip install tensorboardX scikit-learn
|
- run: sudo pip install tensorboardX scikit-learn
|
||||||
- run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
|
- run: codecov
|
||||||
|
build_py3_torch:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.5
|
||||||
|
resource_class: xlarge
|
||||||
|
parallelism: 1
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install torch
|
||||||
|
- run: sudo pip install --progress-bar off .
|
||||||
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
|
- run: sudo pip install tensorboardX scikit-learn
|
||||||
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
- run: python -m pytest -sv ./examples/
|
- run: python -m pytest -sv ./examples/
|
||||||
- run: codecov
|
- run: codecov
|
||||||
build_py2:
|
build_py3_tf:
|
||||||
working_directory: ~/pytorch-transformers
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.5
|
||||||
|
resource_class: xlarge
|
||||||
|
parallelism: 1
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install tensorflow==2.0.0-rc0
|
||||||
|
- run: sudo pip install --progress-bar off .
|
||||||
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
|
- run: sudo pip install tensorboardX scikit-learn
|
||||||
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
|
- run: codecov
|
||||||
|
build_py2_torch:
|
||||||
|
working_directory: ~/transformers
|
||||||
resource_class: large
|
resource_class: large
|
||||||
parallelism: 1
|
parallelism: 1
|
||||||
docker:
|
docker:
|
||||||
- image: circleci/python:2.7
|
- image: circleci/python:2.7
|
||||||
steps:
|
steps:
|
||||||
- checkout
|
- checkout
|
||||||
|
- run: sudo pip install torch
|
||||||
- run: sudo pip install --progress-bar off .
|
- run: sudo pip install --progress-bar off .
|
||||||
- run: sudo pip install pytest codecov pytest-cov
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
- run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
|
- run: codecov
|
||||||
|
build_py2_tf:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
resource_class: large
|
||||||
|
parallelism: 1
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:2.7
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install tensorflow==2.0.0-rc0
|
||||||
|
- run: sudo pip install --progress-bar off .
|
||||||
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
- run: codecov
|
- run: codecov
|
||||||
deploy_doc:
|
deploy_doc:
|
||||||
working_directory: ~/pytorch-transformers
|
working_directory: ~/transformers
|
||||||
docker:
|
docker:
|
||||||
- image: circleci/python:3.5
|
- image: circleci/python:3.5
|
||||||
steps:
|
steps:
|
||||||
@@ -37,7 +81,6 @@ jobs:
|
|||||||
- checkout
|
- checkout
|
||||||
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
||||||
- run: sudo pip install --progress-bar off -r requirements.txt
|
- run: sudo pip install --progress-bar off -r requirements.txt
|
||||||
- run: cd docs/source && ln -s ../../examples/README.md examples.md && cd -
|
|
||||||
- run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
- run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
||||||
workflow_filters: &workflow_filters
|
workflow_filters: &workflow_filters
|
||||||
filters:
|
filters:
|
||||||
@@ -48,6 +91,9 @@ workflows:
|
|||||||
version: 2
|
version: 2
|
||||||
build_and_test:
|
build_and_test:
|
||||||
jobs:
|
jobs:
|
||||||
- build_py3
|
- build_py3_torch_and_tf
|
||||||
- build_py2
|
- build_py3_torch
|
||||||
|
- build_py3_tf
|
||||||
|
- build_py2_torch
|
||||||
|
- build_py2_tf
|
||||||
- deploy_doc: *workflow_filters
|
- deploy_doc: *workflow_filters
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
[run]
|
[run]
|
||||||
source=pytorch_transformers
|
source=transformers
|
||||||
omit =
|
omit =
|
||||||
# skip conversion scripts from testing for now
|
# skip conversion scripts from testing for now
|
||||||
*/convert_*
|
*/convert_*
|
||||||
|
|||||||
2
.github/ISSUE_TEMPLATE/migration.md
vendored
2
.github/ISSUE_TEMPLATE/migration.md
vendored
@@ -1,6 +1,6 @@
|
|||||||
---
|
---
|
||||||
name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
|
name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
|
||||||
about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers
|
about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
|
||||||
---
|
---
|
||||||
|
|
||||||
## 📚 Migration
|
## 📚 Migration
|
||||||
|
|||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -130,5 +130,5 @@ runs
|
|||||||
examples/runs
|
examples/runs
|
||||||
|
|
||||||
# data
|
# data
|
||||||
data
|
/data
|
||||||
serialization_dir
|
serialization_dir
|
||||||
247
README.md
247
README.md
@@ -1,47 +1,86 @@
|
|||||||
# 👾 PyTorch-Transformers
|
<p align="center">
|
||||||
|
<br>
|
||||||
|
<img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
|
||||||
|
<br>
|
||||||
|
</p>
|
||||||
|
<p align="center">
|
||||||
|
<a href="https://circleci.com/gh/huggingface/transformers">
|
||||||
|
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
|
||||||
|
</a>
|
||||||
|
<a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
|
||||||
|
<img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
|
||||||
|
</a>
|
||||||
|
<a href="https://huggingface.co/transformers/index.html">
|
||||||
|
<img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
|
||||||
|
</a>
|
||||||
|
<a href="https://github.com/huggingface/transformers/releases">
|
||||||
|
<img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
|
||||||
|
</a>
|
||||||
|
</p>
|
||||||
|
|
||||||
[](https://circleci.com/gh/huggingface/pytorch-transformers)
|
<h3 align="center">
|
||||||
|
<p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch</p>
|
||||||
|
</h3>
|
||||||
|
|
||||||
PyTorch-Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
|
🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
|
||||||
|
|
||||||
The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
|
### Features
|
||||||
|
|
||||||
1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
- As easy to use as pytorch-transformers
|
||||||
2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
- As powerful and concise as Keras
|
||||||
3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
- High performance on NLU and NLG tasks
|
||||||
4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
- Low barrier to entry for educators and practitioners
|
||||||
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
|
||||||
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
State-of-the-art NLP for everyone
|
||||||
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
- Deep learning researchers
|
||||||
8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
|
- Hands-on practitioners
|
||||||
) by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
- AI/ML/NLP teachers and educators
|
||||||
|
|
||||||
|
Lower compute costs, smaller carbon footprint
|
||||||
|
- Researchers can share trained models instead of always retraining
|
||||||
|
- Practitioners can reduce compute time and production costs
|
||||||
|
- 8 architectures with over 30 pretrained models, some in more than 100 languages
|
||||||
|
|
||||||
|
Choose the right framework for every part of a model's lifetime
|
||||||
|
- Train state-of-the-art models in 3 lines of code
|
||||||
|
- Deep interoperability between TensorFlow 2.0 and PyTorch models
|
||||||
|
- Move a single model between TF2.0/PyTorch frameworks at will
|
||||||
|
- Seamlessly pick the right framework for training, evaluation, production
|
||||||
|
|
||||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
|
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
|-|-|
|
|-|-|
|
||||||
| [Installation](#installation) | How to install the package |
|
| [Installation](#installation) | How to install the package |
|
||||||
|
| [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
|
||||||
| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
|
| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
|
||||||
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
||||||
|
| [Quick tour: TF 2.0 and PyTorch](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
|
||||||
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
||||||
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
|
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
||||||
| [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more |
|
| [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
||||||
|
| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.0.0+
|
This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
|
||||||
|
|
||||||
### With pip
|
### With pip
|
||||||
|
|
||||||
PyTorch-Transformers can be installed by pip as follows:
|
First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
|
||||||
|
Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) for the specific install command for your platform.
|
||||||
|
|
||||||
|
When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install pytorch-transformers
|
pip install transformers
|
||||||
```
|
```
|
||||||
|
|
||||||
### From source
|
### From source
|
||||||
|
|
||||||
Clone the repository and run:
|
Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
|
||||||
|
Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) for the specific install command for your platform.
|
||||||
|
|
||||||
|
When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install [--editable] .
|
pip install [--editable] .
|
||||||
@@ -49,14 +88,16 @@ pip install [--editable] .
|
|||||||
|
|
||||||
### Tests
|
### Tests
|
||||||
|
|
||||||
A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples).
|
A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and example tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||||
|
|
||||||
These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
||||||
|
|
||||||
|
Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
|
||||||
|
|
||||||
You can run the tests from the root of the cloned repository with the commands:
|
You can run the tests from the root of the cloned repository with the commands:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m pytest -sv ./pytorch_transformers/tests/
|
python -m pytest -sv ./transformers/tests/
|
||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -66,8 +107,23 @@ You should check out our [`swift-coreml-transformers`](https://github.com/huggin
|
|||||||
|
|
||||||
It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
|
It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
|
||||||
|
|
||||||
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
|
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
|
||||||
or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
|
|
||||||
|
## Model architectures
|
||||||
|
|
||||||
|
🤗 Transformers currently provides 8 NLU/NLG architectures:
|
||||||
|
|
||||||
|
1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||||
|
2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||||
|
3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||||
|
4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||||
|
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||||
|
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||||
|
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
|
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
|
||||||
|
) by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
||||||
|
|
||||||
|
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
||||||
|
|
||||||
## Online demo
|
## Online demo
|
||||||
|
|
||||||
@@ -80,22 +136,25 @@ You can use it to experiment with completions generated by `GPT2Model`, `Transfo
|
|||||||
|
|
||||||
## Quick tour
|
## Quick tour
|
||||||
|
|
||||||
Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
|
Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import *
|
from transformers import *
|
||||||
|
|
||||||
# PyTorch-Transformers has a unified API
|
# Transformers has a unified API
|
||||||
# for 7 transformer architectures and 30 pretrained weights.
|
# for 8 transformer architectures and 30 pretrained weights.
|
||||||
# Model | Tokenizer | Pretrained weights shortcut
|
# Model | Tokenizer | Pretrained weights shortcut
|
||||||
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
|
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
|
||||||
(OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
|
(OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
|
||||||
(GPT2Model, GPT2Tokenizer, 'gpt2'),
|
(GPT2Model, GPT2Tokenizer, 'gpt2'),
|
||||||
(TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
|
(TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
|
||||||
(XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
|
(XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
|
||||||
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
|
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
|
||||||
(RobertaModel, RobertaTokenizer, 'roberta-base')]
|
(DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
|
||||||
|
(RobertaModel, RobertaTokenizer, 'roberta-base')]
|
||||||
|
|
||||||
|
# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
|
||||||
|
|
||||||
# Let's encode some text in a sequence of hidden-states using each model:
|
# Let's encode some text in a sequence of hidden-states using each model:
|
||||||
for model_class, tokenizer_class, pretrained_weights in MODELS:
|
for model_class, tokenizer_class, pretrained_weights in MODELS:
|
||||||
@@ -121,24 +180,71 @@ for model_class in BERT_MODEL_CLASSES:
|
|||||||
# Load pretrained model/tokenizer
|
# Load pretrained model/tokenizer
|
||||||
model = model_class.from_pretrained('bert-base-uncased')
|
model = model_class.from_pretrained('bert-base-uncased')
|
||||||
|
|
||||||
# Models can return full list of hidden-states & attentions weights at each layer
|
# Models can return full list of hidden-states & attentions weights at each layer
|
||||||
model = model_class.from_pretrained(pretrained_weights,
|
model = model_class.from_pretrained(pretrained_weights,
|
||||||
output_hidden_states=True,
|
output_hidden_states=True,
|
||||||
output_attentions=True)
|
output_attentions=True)
|
||||||
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
|
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
|
||||||
all_hidden_states, all_attentions = model(input_ids)[-2:]
|
all_hidden_states, all_attentions = model(input_ids)[-2:]
|
||||||
|
|
||||||
# Models are compatible with Torchscript
|
# Models are compatible with Torchscript
|
||||||
model = model_class.from_pretrained(pretrained_weights, torchscript=True)
|
model = model_class.from_pretrained(pretrained_weights, torchscript=True)
|
||||||
traced_model = torch.jit.trace(model, (input_ids,))
|
traced_model = torch.jit.trace(model, (input_ids,))
|
||||||
|
|
||||||
# Simple serialization for models and tokenizers
|
# Simple serialization for models and tokenizers
|
||||||
model.save_pretrained('./directory/to/save/') # save
|
model.save_pretrained('./directory/to/save/') # save
|
||||||
model = model_class.from_pretrained('./directory/to/save/') # re-load
|
model = model_class.from_pretrained('./directory/to/save/') # re-load
|
||||||
tokenizer.save_pretrained('./directory/to/save/') # save
|
tokenizer.save_pretrained('./directory/to/save/') # save
|
||||||
tokenizer = tokenizer_class.from_pretrained('./directory/to/save/') # re-load
|
tokenizer = BertTokenizer.from_pretrained('./directory/to/save/') # re-load
|
||||||
|
|
||||||
# SOTA examples for GLUE, SQUAD, text generation...
|
# SOTA examples for GLUE, SQUAD, text generation...
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick tour TF 2.0 training and PyTorch interoperability
|
||||||
|
|
||||||
|
Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow_datasets
|
||||||
|
from transformers import *
|
||||||
|
|
||||||
|
# Load dataset, tokenizer, model from pretrained model/vocabulary
|
||||||
|
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
||||||
|
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
|
||||||
|
data = tensorflow_datasets.load('glue/mrpc')
|
||||||
|
|
||||||
|
# Prepare dataset for GLUE as a tf.data.Dataset instance
|
||||||
|
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
|
||||||
|
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
|
||||||
|
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
|
||||||
|
valid_dataset = valid_dataset.batch(64)
|
||||||
|
|
||||||
|
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
|
||||||
|
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
|
||||||
|
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||||
|
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
|
||||||
|
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
|
||||||
|
|
||||||
|
# Train and evaluate using tf.keras.Model.fit()
|
||||||
|
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
|
||||||
|
validation_data=valid_dataset, validation_steps=7)
|
||||||
|
|
||||||
|
# Load the TensorFlow model in PyTorch for inspection
|
||||||
|
model.save_pretrained('./save/')
|
||||||
|
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
|
||||||
|
|
||||||
|
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
|
||||||
|
sentence_0 = "This research was consistent with his findings."
|
||||||
|
sentence_1 = "His findings were compatible with this research."
|
||||||
|
sentence_2 = "His findings were not compatible with this research."
|
||||||
|
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
|
||||||
|
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
|
||||||
|
|
||||||
|
pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
|
||||||
|
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
|
||||||
|
print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
|
||||||
|
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
|
||||||
```
|
```
|
||||||
|
|
||||||
## Quick tour of the fine-tuning/usage scripts
|
## Quick tour of the fine-tuning/usage scripts
|
||||||
@@ -288,7 +394,7 @@ This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-s
|
|||||||
### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
|
### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
|
||||||
|
|
||||||
A conditional generation script is also included to generate text from a prompt.
|
A conditional generation script is also included to generate text from a prompt.
|
||||||
The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
|
The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
|
||||||
|
|
||||||
Here is how to run the script with the small version of OpenAI GPT-2 model:
|
Here is how to run the script with the small version of OpenAI GPT-2 model:
|
||||||
|
|
||||||
@@ -299,19 +405,32 @@ python ./examples/run_generation.py \
|
|||||||
--model_name_or_path=gpt2 \
|
--model_name_or_path=gpt2 \
|
||||||
```
|
```
|
||||||
|
|
||||||
## Migrating from pytorch-pretrained-bert to pytorch-transformers
|
## Migrating from pytorch-transformers to transformers
|
||||||
|
|
||||||
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
|
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
|
||||||
|
|
||||||
|
### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
|
||||||
|
|
||||||
|
To be able to use TorchScript (see #1010, #1204 and #1195), the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
|
||||||
|
|
||||||
|
If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
|
||||||
|
|
||||||
|
If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
|
||||||
|
|
||||||
|
|
||||||
|
## Migrating from pytorch-pretrained-bert to transformers
|
||||||
|
|
||||||
|
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.
|
||||||
|
|
||||||
### Models always output `tuples`
|
### Models always output `tuples`
|
||||||
|
|
||||||
The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
||||||
|
|
||||||
The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
|
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
|
||||||
|
|
||||||
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
||||||
|
|
||||||
Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
|
Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Let's load our model
|
# Let's load our model
|
||||||
@@ -320,11 +439,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
|||||||
# If you used to have this line in pytorch-pretrained-bert:
|
# If you used to have this line in pytorch-pretrained-bert:
|
||||||
loss = model(input_ids, labels=labels)
|
loss = model(input_ids, labels=labels)
|
||||||
|
|
||||||
# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
|
# Now just use this line in transformers to extract the loss from the output tuple:
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss = outputs[0]
|
loss = outputs[0]
|
||||||
|
|
||||||
# In pytorch-transformers you can also have access to the logits:
|
# In transformers you can also have access to the logits:
|
||||||
loss, logits = outputs[:2]
|
loss, logits = outputs[:2]
|
||||||
|
|
||||||
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
||||||
@@ -333,13 +452,17 @@ outputs = model(input_ids, labels=labels)
|
|||||||
loss, logits, attentions = outputs
|
loss, logits, attentions = outputs
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Using hidden states
|
||||||
|
|
||||||
|
By enabling the configuration option `output_hidden_states`, it was possible to retrieve the last hidden states of the encoder. In `pytorch-transformers` as well as `transformers` the return value has changed slightly: `all_hidden_states` now also includes the hidden state of the embeddings in addition to those of the encoding layers. This allows users to easily access the embeddings' final state.
|
||||||
|
|
||||||
### Serialization
|
### Serialization
|
||||||
|
|
||||||
Breaking change in the `from_pretrained()`method:
|
Breaking change in the `from_pretrained()` method:
|
||||||
|
|
||||||
1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
|
1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
|
||||||
|
|
||||||
2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
|
2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding to the model's `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
|
||||||
|
|
||||||
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
|
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
|
||||||
|
|
||||||
@@ -396,7 +519,7 @@ for batch in train_data:
|
|||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
|
### In Transformers, optimizer and schedules are split and instantiated like this:
|
||||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
||||||
### and used like this:
|
### and used like this:
|
||||||
@@ -411,4 +534,4 @@ for batch in train_data:
|
|||||||
|
|
||||||
## Citation
|
## Citation
|
||||||
|
|
||||||
At the moment, there is no paper associated to PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
|
At the moment, there is no paper associated with Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
|
||||||
|
|||||||
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
|
|||||||
|
|
||||||
RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
|
RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
|
||||||
|
|
||||||
RUN pip install pytorch_transformers
|
RUN pip install transformers
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
@@ -34,11 +34,11 @@ pip install recommonmark
|
|||||||
|
|
||||||
## Building the documentation
|
## Building the documentation
|
||||||
|
|
||||||
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the followig
|
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
|
||||||
command to generate it:
|
command to generate it:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ln -s ../../examples/README.md source/examples.md
|
ln -s ../../examples/README.md examples.md
|
||||||
```
|
```
|
||||||
|
|
||||||
Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
|
Once you have set up `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
|
||||||
|
|||||||
@@ -26,4 +26,7 @@ sphinxcontrib-jsmath==1.0.1
|
|||||||
sphinxcontrib-qthelp==1.0.2
|
sphinxcontrib-qthelp==1.0.2
|
||||||
sphinxcontrib-serializinghtml==1.1.3
|
sphinxcontrib-serializinghtml==1.1.3
|
||||||
urllib3==1.25.3
|
urllib3==1.25.3
|
||||||
sphinx-markdown-tables==0.0.9
|
sphinx-markdown-tables==0.0.9
|
||||||
|
numpy==1.17.2
|
||||||
|
tensorflow==2.0.0rc2
|
||||||
|
torch==1.2.0
|
||||||
File diff suppressed because one or more lines are too long
@@ -15,4 +15,4 @@ In order to help this new field develop, we have included a few additional featu
|
|||||||
* accessing all the attention weights for each head of BERT/GPT/GPT-2,
|
* accessing all the attention weights for each head of BERT/GPT/GPT-2,
|
||||||
* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
|
* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
|
||||||
|
|
||||||
To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
|
To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ which extracts information and prunes a model pre-trained on GLUE.
|
||||||
|
|||||||
@@ -19,14 +19,14 @@ sys.path.insert(0, os.path.abspath('../..'))
|
|||||||
|
|
||||||
# -- Project information -----------------------------------------------------
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
project = u'pytorch-transformers'
|
project = u'transformers'
|
||||||
copyright = u'2019, huggingface'
|
copyright = u'2019, huggingface'
|
||||||
author = u'huggingface'
|
author = u'huggingface'
|
||||||
|
|
||||||
# The short X.Y version
|
# The short X.Y version
|
||||||
version = u''
|
version = u''
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = u'1.2.0'
|
release = u'2.0.0'
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
@@ -109,7 +109,7 @@ html_static_path = ['_static']
|
|||||||
# -- Options for HTMLHelp output ---------------------------------------------
|
# -- Options for HTMLHelp output ---------------------------------------------
|
||||||
|
|
||||||
# Output file base name for HTML help builder.
|
# Output file base name for HTML help builder.
|
||||||
htmlhelp_basename = 'pytorch-transformersdoc'
|
htmlhelp_basename = 'transformersdoc'
|
||||||
|
|
||||||
|
|
||||||
# -- Options for LaTeX output ------------------------------------------------
|
# -- Options for LaTeX output ------------------------------------------------
|
||||||
@@ -136,7 +136,7 @@ latex_elements = {
|
|||||||
# (source start file, target name, title,
|
# (source start file, target name, title,
|
||||||
# author, documentclass [howto, manual, or own class]).
|
# author, documentclass [howto, manual, or own class]).
|
||||||
latex_documents = [
|
latex_documents = [
|
||||||
(master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation',
|
(master_doc, 'transformers.tex', u'transformers Documentation',
|
||||||
u'huggingface', 'manual'),
|
u'huggingface', 'manual'),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -146,7 +146,7 @@ latex_documents = [
|
|||||||
# One entry per manual page. List of tuples
|
# One entry per manual page. List of tuples
|
||||||
# (source start file, name, description, authors, manual section).
|
# (source start file, name, description, authors, manual section).
|
||||||
man_pages = [
|
man_pages = [
|
||||||
(master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
|
(master_doc, 'transformers', u'transformers Documentation',
|
||||||
[author], 1)
|
[author], 1)
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -157,8 +157,8 @@ man_pages = [
|
|||||||
# (source start file, target name, title, author,
|
# (source start file, target name, title, author,
|
||||||
# dir menu entry, description, category)
|
# dir menu entry, description, category)
|
||||||
texinfo_documents = [
|
texinfo_documents = [
|
||||||
(master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
|
(master_doc, 'transformers', u'transformers Documentation',
|
||||||
author, 'pytorch-transformers', 'One line description of project.',
|
author, 'transformers', 'One line description of project.',
|
||||||
'Miscellaneous'),
|
'Miscellaneous'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
|
|||||||
BERT
|
BERT
|
||||||
^^^^
|
^^^^
|
||||||
|
|
||||||
You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
|
You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
|
||||||
|
|
||||||
This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
|
This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
|
||||||
|
|
||||||
@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
|
|||||||
|
|
||||||
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
|
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
|
||||||
|
|
||||||
pytorch_transformers bert \
|
transformers bert \
|
||||||
$BERT_BASE_DIR/bert_model.ckpt \
|
$BERT_BASE_DIR/bert_model.ckpt \
|
||||||
$BERT_BASE_DIR/bert_config.json \
|
$BERT_BASE_DIR/bert_config.json \
|
||||||
$BERT_BASE_DIR/pytorch_model.bin
|
$BERT_BASE_DIR/pytorch_model.bin
|
||||||
@@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
|
|||||||
|
|
||||||
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
|
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
|
||||||
|
|
||||||
pytorch_transformers gpt \
|
transformers gpt \
|
||||||
$OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
|
$OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
[OPENAI_GPT_CONFIG]
|
[OPENAI_GPT_CONFIG]
|
||||||
@@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode
|
|||||||
|
|
||||||
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
|
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
|
||||||
|
|
||||||
pytorch_transformers gpt2 \
|
transformers gpt2 \
|
||||||
$OPENAI_GPT2_CHECKPOINT_PATH \
|
$OPENAI_GPT2_CHECKPOINT_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
[OPENAI_GPT2_CONFIG]
|
[OPENAI_GPT2_CONFIG]
|
||||||
@@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
|
|||||||
|
|
||||||
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
|
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
|
||||||
|
|
||||||
pytorch_transformers transfo_xl \
|
transformers transfo_xl \
|
||||||
$TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
|
$TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
[TRANSFO_XL_CONFIG]
|
[TRANSFO_XL_CONFIG]
|
||||||
@@ -80,7 +80,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
|
|||||||
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
|
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
|
||||||
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
|
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
|
||||||
|
|
||||||
pytorch_transformers xlnet \
|
transformers xlnet \
|
||||||
$TRANSFO_XL_CHECKPOINT_PATH \
|
$TRANSFO_XL_CHECKPOINT_PATH \
|
||||||
$TRANSFO_XL_CONFIG_PATH \
|
$TRANSFO_XL_CONFIG_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
@@ -96,6 +96,6 @@ Here is an example of the conversion process for a pre-trained XLM model:
|
|||||||
|
|
||||||
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
|
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
|
||||||
|
|
||||||
pytorch_transformers xlm \
|
transformers xlm \
|
||||||
$XLM_CHECKPOINT_PATH \
|
$XLM_CHECKPOINT_PATH \
|
||||||
$PYTORCH_DUMP_OUTPUT \
|
$PYTORCH_DUMP_OUTPUT \
|
||||||
|
|||||||
1
docs/source/examples.md
Symbolic link
1
docs/source/examples.md
Symbolic link
@@ -0,0 +1 @@
|
|||||||
|
../../examples/README.md
|
||||||
BIN
docs/source/imgs/transformers_logo_name.png
Normal file
BIN
docs/source/imgs/transformers_logo_name.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 8.7 KiB |
@@ -1,9 +1,43 @@
|
|||||||
Pytorch-Transformers
|
Transformers
|
||||||
================================================================================================================================================
|
================================================================================================================================================
|
||||||
|
|
||||||
PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
|
🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
|
||||||
|
(BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
|
||||||
|
(NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
|
||||||
|
|
||||||
The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
|
This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
|
||||||
|
|
||||||
|
Features
|
||||||
|
---------------------------------------------------
|
||||||
|
|
||||||
|
- As easy to use as pytorch-transformers
|
||||||
|
- As powerful and concise as Keras
|
||||||
|
- High performance on NLU and NLG tasks
|
||||||
|
- Low barrier to entry for educators and practitioners
|
||||||
|
|
||||||
|
State-of-the-art NLP for everyone:
|
||||||
|
|
||||||
|
- Deep learning researchers
|
||||||
|
- Hands-on practitioners
|
||||||
|
- AI/ML/NLP teachers and educators
|
||||||
|
|
||||||
|
Lower compute costs, smaller carbon footprint:
|
||||||
|
|
||||||
|
- Researchers can share trained models instead of always retraining
|
||||||
|
- Practitioners can reduce compute time and production costs
|
||||||
|
- 8 architectures with over 30 pretrained models, some in more than 100 languages
|
||||||
|
|
||||||
|
Choose the right framework for every part of a model's lifetime:
|
||||||
|
|
||||||
|
- Train state-of-the-art models in 3 lines of code
|
||||||
|
- Deep interoperability between TensorFlow 2.0 and PyTorch models
|
||||||
|
- Move a single model between TF2.0/PyTorch frameworks at will
|
||||||
|
- Seamlessly pick the right framework for training, evaluation, production
|
||||||
|
|
||||||
|
Contents
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
|
||||||
|
|
||||||
1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||||
2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||||
@@ -12,7 +46,8 @@ The library currently contains PyTorch implementations, pre-trained model weight
|
|||||||
5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||||
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
||||||
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
||||||
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
@@ -37,6 +72,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
|
|||||||
main_classes/model
|
main_classes/model
|
||||||
main_classes/tokenizer
|
main_classes/tokenizer
|
||||||
main_classes/optimizer_schedules
|
main_classes/optimizer_schedules
|
||||||
|
main_classes/processors
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
Installation
|
Installation
|
||||||
================================================
|
================================================
|
||||||
|
|
||||||
PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
|
Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on Python 3.5+) and PyTorch 1.1.0
|
||||||
|
|
||||||
With pip
|
With pip
|
||||||
^^^^^^^^
|
^^^^^^^^
|
||||||
@@ -10,7 +10,7 @@ PyTorch Transformers can be installed using pip as follows:
|
|||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
pip install pytorch-transformers
|
pip install transformers
|
||||||
|
|
||||||
From source
|
From source
|
||||||
^^^^^^^^^^^
|
^^^^^^^^^^^
|
||||||
@@ -19,15 +19,15 @@ To install from source, clone the repository and install with:
|
|||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
git clone https://github.com/huggingface/pytorch-transformers.git
|
git clone https://github.com/huggingface/transformers.git
|
||||||
cd pytorch-transformers
|
cd transformers
|
||||||
pip install [--editable] .
|
pip install [--editable] .
|
||||||
|
|
||||||
|
|
||||||
Tests
|
Tests
|
||||||
^^^^^
|
^^^^^
|
||||||
|
|
||||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
|
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/transformers/tree/master/transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/transformers/tree/master/examples>`_.
|
||||||
|
|
||||||
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@ Run all the tests from the root of the cloned repository with the commands:
|
|||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
python -m pytest -sv ./pytorch_transformers/tests/
|
python -m pytest -sv ./transformers/tests/
|
||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,5 +6,5 @@ The base class ``PretrainedConfig`` implements the common methods for loading/sa
|
|||||||
``PretrainedConfig``
|
``PretrainedConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.PretrainedConfig
|
.. autoclass:: transformers.PretrainedConfig
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -11,5 +11,11 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
|
|||||||
``PreTrainedModel``
|
``PreTrainedModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.PreTrainedModel
|
.. autoclass:: transformers.PreTrainedModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
``TFPreTrainedModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFPreTrainedModel
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ The ``.optimization`` module provides:
|
|||||||
``AdamW``
|
``AdamW``
|
||||||
~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.AdamW
|
.. autoclass:: transformers.AdamW
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
Schedules
|
Schedules
|
||||||
@@ -18,11 +18,11 @@ Schedules
|
|||||||
Learning Rate Schedules
|
Learning Rate Schedules
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.ConstantLRSchedule
|
.. autoclass:: transformers.ConstantLRSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.WarmupConstantSchedule
|
.. autoclass:: transformers.WarmupConstantSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_constant_schedule.png
|
.. image:: /imgs/warmup_constant_schedule.png
|
||||||
@@ -30,7 +30,7 @@ Learning Rate Schedules
|
|||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.WarmupCosineSchedule
|
.. autoclass:: transformers.WarmupCosineSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_cosine_schedule.png
|
.. image:: /imgs/warmup_cosine_schedule.png
|
||||||
@@ -38,7 +38,7 @@ Learning Rate Schedules
|
|||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
|
.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
||||||
@@ -47,7 +47,7 @@ Learning Rate Schedules
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.WarmupLinearSchedule
|
.. autoclass:: transformers.WarmupLinearSchedule
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_linear_schedule.png
|
.. image:: /imgs/warmup_linear_schedule.png
|
||||||
|
|||||||
58
docs/source/main_classes/processors.rst
Normal file
58
docs/source/main_classes/processors.rst
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
Processors
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
This library includes processors for several traditional tasks. These processors can be used to process a dataset into
|
||||||
|
examples that can be fed to a model.
|
||||||
|
|
||||||
|
Processors
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
All processors follow the same architecture which is that of the
|
||||||
|
:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
|
||||||
|
of :class:`~transformers.data.processors.utils.InputExample`. These
|
||||||
|
:class:`~transformers.data.processors.utils.InputExample` can be converted to
|
||||||
|
:class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
|
||||||
|
|
||||||
|
.. autoclass:: transformers.data.processors.utils.DataProcessor
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
.. autoclass:: transformers.data.processors.utils.InputExample
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
.. autoclass:: transformers.data.processors.utils.InputFeatures
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
GLUE
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
|
||||||
|
the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
|
||||||
|
`GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__
|
||||||
|
|
||||||
|
This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
|
||||||
|
CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
|
||||||
|
|
||||||
|
Those processors are:
|
||||||
|
- :class:`~transformers.data.processors.utils.MrpcProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.MnliProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.Sst2Processor`
|
||||||
|
- :class:`~transformers.data.processors.utils.StsbProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.QqpProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.QnliProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.RteProcessor`
|
||||||
|
- :class:`~transformers.data.processors.utils.WnliProcessor`
|
||||||
|
|
||||||
|
Additionally, the following method can be used to load values from a data file and convert them to a list of
|
||||||
|
:class:`~transformers.data.processors.utils.InputFeatures`.
|
||||||
|
|
||||||
|
.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
|
||||||
|
|
||||||
|
Example usage
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
An example using these processors is given in the
|
||||||
|
`run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
|
||||||
@@ -12,5 +12,5 @@ The base class ``PreTrainedTokenizer`` implements the common methods for loading
|
|||||||
``PreTrainedTokenizer``
|
``PreTrainedTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.PreTrainedTokenizer
|
.. autoclass:: transformers.PreTrainedTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -1,17 +1,17 @@
|
|||||||
# Migrating from pytorch-pretrained-bert
|
# Migrating from pytorch-pretrained-bert
|
||||||
|
|
||||||
|
|
||||||
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
|
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
|
||||||
|
|
||||||
### Models always output `tuples`
|
### Models always output `tuples`
|
||||||
|
|
||||||
The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
||||||
|
|
||||||
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
|
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
|
||||||
|
|
||||||
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
||||||
|
|
||||||
Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
|
Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Let's load our model
|
# Let's load our model
|
||||||
@@ -20,11 +20,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
|||||||
# If you used to have this line in pytorch-pretrained-bert:
|
# If you used to have this line in pytorch-pretrained-bert:
|
||||||
loss = model(input_ids, labels=labels)
|
loss = model(input_ids, labels=labels)
|
||||||
|
|
||||||
# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
|
# Now just use this line in transformers to extract the loss from the output tuple:
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss = outputs[0]
|
loss = outputs[0]
|
||||||
|
|
||||||
# In pytorch-transformers you can also have access to the logits:
|
# In transformers you can also have access to the logits:
|
||||||
loss, logits = outputs[:2]
|
loss, logits = outputs[:2]
|
||||||
|
|
||||||
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
||||||
@@ -96,7 +96,7 @@ for batch in train_data:
|
|||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
### In PyTorch-Transformers, optimizer and schedules are split and instantiated like this:
|
### In Transformers, optimizer and schedules are split and instantiated like this:
|
||||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
||||||
### and used like this:
|
### and used like this:
|
||||||
|
|||||||
@@ -11,19 +11,19 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
|
|||||||
``AutoConfig``
|
``AutoConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.AutoConfig
|
.. autoclass:: transformers.AutoConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``AutoModel``
|
``AutoModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.AutoModel
|
.. autoclass:: transformers.AutoModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``AutoTokenizer``
|
``AutoTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.AutoTokenizer
|
.. autoclass:: transformers.AutoTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,69 +4,125 @@ BERT
|
|||||||
``BertConfig``
|
``BertConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertConfig
|
.. autoclass:: transformers.BertConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertTokenizer``
|
``BertTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertTokenizer
|
.. autoclass:: transformers.BertTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertModel``
|
``BertModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertModel
|
.. autoclass:: transformers.BertModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForPreTraining``
|
``BertForPreTraining``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForPreTraining
|
.. autoclass:: transformers.BertForPreTraining
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForMaskedLM``
|
``BertForMaskedLM``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForMaskedLM
|
.. autoclass:: transformers.BertForMaskedLM
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForNextSentencePrediction``
|
``BertForNextSentencePrediction``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
|
.. autoclass:: transformers.BertForNextSentencePrediction
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForSequenceClassification``
|
``BertForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForSequenceClassification
|
.. autoclass:: transformers.BertForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForMultipleChoice``
|
``BertForMultipleChoice``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForMultipleChoice
|
.. autoclass:: transformers.BertForMultipleChoice
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForTokenClassification``
|
``BertForTokenClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForTokenClassification
|
.. autoclass:: transformers.BertForTokenClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``BertForQuestionAnswering``
|
``BertForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.BertForQuestionAnswering
|
.. autoclass:: transformers.BertForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForPreTraining``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForPreTraining
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForNextSentencePrediction``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForNextSentencePrediction
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForMultipleChoice``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForMultipleChoice
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForTokenClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForTokenClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFBertForQuestionAnswering``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFBertForQuestionAnswering
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|||||||
@@ -4,40 +4,67 @@ DistilBERT
|
|||||||
``DistilBertConfig``
|
``DistilBertConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertConfig
|
.. autoclass:: transformers.DistilBertConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertTokenizer``
|
``DistilBertTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertTokenizer
|
.. autoclass:: transformers.DistilBertTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertModel``
|
``DistilBertModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertModel
|
.. autoclass:: transformers.DistilBertModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertForMaskedLM``
|
``DistilBertForMaskedLM``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
|
.. autoclass:: transformers.DistilBertForMaskedLM
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertForSequenceClassification``
|
``DistilBertForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
|
.. autoclass:: transformers.DistilBertForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertForQuestionAnswering``
|
``DistilBertForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertForQuestionAnswering
|
.. autoclass:: transformers.DistilBertForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
``TFDistilBertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFDistilBertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFDistilBertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFDistilBertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFDistilBertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFDistilBertForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFDistilBertForQuestionAnswering``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFDistilBertForQuestionAnswering
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,33 +4,54 @@ OpenAI GPT
|
|||||||
``OpenAIGPTConfig``
|
``OpenAIGPTConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTConfig
|
.. autoclass:: transformers.OpenAIGPTConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``OpenAIGPTTokenizer``
|
``OpenAIGPTTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
|
.. autoclass:: transformers.OpenAIGPTTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``OpenAIGPTModel``
|
``OpenAIGPTModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTModel
|
.. autoclass:: transformers.OpenAIGPTModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``OpenAIGPTLMHeadModel``
|
``OpenAIGPTLMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
|
.. autoclass:: transformers.OpenAIGPTLMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``OpenAIGPTDoubleHeadsModel``
|
``OpenAIGPTDoubleHeadsModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
|
.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFOpenAIGPTModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFOpenAIGPTModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFOpenAIGPTLMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFOpenAIGPTLMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFOpenAIGPTDoubleHeadsModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,33 +4,54 @@ OpenAI GPT2
|
|||||||
``GPT2Config``
|
``GPT2Config``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2Config
|
.. autoclass:: transformers.GPT2Config
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``GPT2Tokenizer``
|
``GPT2Tokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2Tokenizer
|
.. autoclass:: transformers.GPT2Tokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``GPT2Model``
|
``GPT2Model``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2Model
|
.. autoclass:: transformers.GPT2Model
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``GPT2LMHeadModel``
|
``GPT2LMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2LMHeadModel
|
.. autoclass:: transformers.GPT2LMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``GPT2DoubleHeadsModel``
|
``GPT2DoubleHeadsModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
|
.. autoclass:: transformers.GPT2DoubleHeadsModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFGPT2Model``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFGPT2Model
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFGPT2LMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFGPT2LMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFGPT2DoubleHeadsModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFGPT2DoubleHeadsModel
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,33 +4,54 @@ RoBERTa
|
|||||||
``RobertaConfig``
|
``RobertaConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaConfig
|
.. autoclass:: transformers.RobertaConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``RobertaTokenizer``
|
``RobertaTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaTokenizer
|
.. autoclass:: transformers.RobertaTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``RobertaModel``
|
``RobertaModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaModel
|
.. autoclass:: transformers.RobertaModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``RobertaForMaskedLM``
|
``RobertaForMaskedLM``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaForMaskedLM
|
.. autoclass:: transformers.RobertaForMaskedLM
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``RobertaForSequenceClassification``
|
``RobertaForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.RobertaForSequenceClassification
|
.. autoclass:: transformers.RobertaForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFRobertaModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFRobertaModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFRobertaForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFRobertaForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFRobertaForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFRobertaForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -5,26 +5,40 @@ Transformer XL
|
|||||||
``TransfoXLConfig``
|
``TransfoXLConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.TransfoXLConfig
|
.. autoclass:: transformers.TransfoXLConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``TransfoXLTokenizer``
|
``TransfoXLTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.TransfoXLTokenizer
|
.. autoclass:: transformers.TransfoXLTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``TransfoXLModel``
|
``TransfoXLModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.TransfoXLModel
|
.. autoclass:: transformers.TransfoXLModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``TransfoXLLMHeadModel``
|
``TransfoXLLMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
|
.. autoclass:: transformers.TransfoXLLMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFTransfoXLModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFTransfoXLModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFTransfoXLLMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFTransfoXLLMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,38 +4,66 @@ XLM
|
|||||||
``XLMConfig``
|
``XLMConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMConfig
|
.. autoclass:: transformers.XLMConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
``XLMTokenizer``
|
``XLMTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMTokenizer
|
.. autoclass:: transformers.XLMTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
``XLMModel``
|
``XLMModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMModel
|
.. autoclass:: transformers.XLMModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLMWithLMHeadModel``
|
``XLMWithLMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
|
.. autoclass:: transformers.XLMWithLMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLMForSequenceClassification``
|
``XLMForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMForSequenceClassification
|
.. autoclass:: transformers.XLMForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLMForQuestionAnswering``
|
``XLMForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
|
.. autoclass:: transformers.XLMForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLMModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLMModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLMWithLMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLMWithLMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLMForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLMForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLMForQuestionAnsweringSimple``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -4,40 +4,68 @@ XLNet
|
|||||||
``XLNetConfig``
|
``XLNetConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetConfig
|
.. autoclass:: transformers.XLNetConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetTokenizer``
|
``XLNetTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetTokenizer
|
.. autoclass:: transformers.XLNetTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetModel``
|
``XLNetModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetModel
|
.. autoclass:: transformers.XLNetModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetLMHeadModel``
|
``XLNetLMHeadModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetLMHeadModel
|
.. autoclass:: transformers.XLNetLMHeadModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetForSequenceClassification``
|
``XLNetForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetForSequenceClassification
|
.. autoclass:: transformers.XLNetForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``XLNetForQuestionAnswering``
|
``XLNetForQuestionAnswering``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
|
.. autoclass:: transformers.XLNetForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLNetModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLNetModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLNetLMHeadModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLNetLMHeadModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLNetForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLNetForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFXLNetForQuestionAnsweringSimple``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
@@ -1,16 +1,16 @@
|
|||||||
Notebooks
|
Notebooks
|
||||||
================================================
|
================================================
|
||||||
|
|
||||||
We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
|
We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
|
||||||
|
|
||||||
|
|
||||||
*
|
*
|
||||||
The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
|
The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
|
||||||
|
|
||||||
*
|
*
|
||||||
The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
|
The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
|
||||||
|
|
||||||
*
|
*
|
||||||
The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
|
The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
|
||||||
|
|
||||||
Please follow the instructions given in the notebooks to run and modify them.
|
Please follow the instructions given in the notebooks to run and modify them.
|
||||||
|
|||||||
@@ -44,15 +44,15 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
|
| | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
|
||||||
| | | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD |
|
| | | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD |
|
||||||
| | | (see details of fine-tuning in the `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__). |
|
| | | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__). |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``bert-large-cased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters |
|
| | ``bert-large-cased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters |
|
||||||
| | | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD |
|
| | | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD |
|
||||||
| | | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__) |
|
| | | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``bert-base-cased-finetuned-mrpc`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
| | ``bert-base-cased-finetuned-mrpc`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
| | | | The ``bert-base-cased`` model fine-tuned on MRPC |
|
| | | | The ``bert-base-cased`` model fine-tuned on MRPC |
|
||||||
| | | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__) |
|
| | | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
| | | | OpenAI GPT English model |
|
| | | | OpenAI GPT English model |
|
||||||
@@ -120,4 +120,4 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| | | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__) |
|
| | | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
|
||||||
.. <https://huggingface.co/pytorch-transformers/examples.html>`__
|
.. <https://huggingface.co/transformers/examples.html>`__
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Philosophy
|
## Philosophy
|
||||||
|
|
||||||
PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
|
Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
|
||||||
|
|
||||||
The library was designed with two strong goals in mind:
|
The library was designed with two strong goals in mind:
|
||||||
|
|
||||||
@@ -19,12 +19,12 @@ The library was designed with two strong goals in mind:
|
|||||||
|
|
||||||
A few other goals:
|
A few other goals:
|
||||||
|
|
||||||
- expose the models internals as consistently as possible:
|
- expose the models' internals as consistently as possible:
|
||||||
|
|
||||||
- we give access, using a single API to the full hidden-states and attention weights,
|
- we give access, using a single API to the full hidden-states and attention weights,
|
||||||
- tokenizer and base model's API are standardized to easily switch between models.
|
- tokenizer and base model's API are standardized to easily switch between models.
|
||||||
|
|
||||||
- incorporate a subjective selection of promising tools for fine-tuning/investiguating these models:
|
- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
|
||||||
|
|
||||||
- a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
|
- a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
|
||||||
- simple ways to mask and prune transformer heads.
|
- simple ways to mask and prune transformer heads.
|
||||||
@@ -33,13 +33,13 @@ A few other goals:
|
|||||||
|
|
||||||
The library is built around three types of classes for each model:
|
The library is built around three types of classes for each model:
|
||||||
|
|
||||||
- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 6 models architectures currently provided in the library, e.g. `BertModel`
|
- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 models architectures currently provided in the library, e.g. `BertModel`
|
||||||
- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these yourself, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
|
- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these yourself, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
|
||||||
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
|
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
|
||||||
|
|
||||||
All these classes can be instantiated from pretrained instances and saved locally using two methods:
|
All these classes can be instantiated from pretrained instances and saved locally using two methods:
|
||||||
|
|
||||||
- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
|
- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
|
||||||
- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
|
- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
|
||||||
|
|
||||||
We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
|
We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
|
||||||
@@ -51,7 +51,7 @@ We'll finish this quickstart tour by going through a few simple quick-start exam
|
|||||||
|
|
||||||
Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
|
Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
|
||||||
|
|
||||||
See full API reference for examples for each model classe.
|
See full API reference for examples for each model class.
|
||||||
|
|
||||||
### BERT example
|
### BERT example
|
||||||
|
|
||||||
@@ -59,7 +59,7 @@ Let's start by preparing a tokenized input (a list of token embeddings indices t
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
|
from transformers import BertTokenizer, BertModel, BertForMaskedLM
|
||||||
|
|
||||||
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
|
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
|
||||||
import logging
|
import logging
|
||||||
@@ -93,8 +93,8 @@ Let's see how we can use `BertModel` to encode our inputs in hidden-states:
|
|||||||
# Load pre-trained model (weights)
|
# Load pre-trained model (weights)
|
||||||
model = BertModel.from_pretrained('bert-base-uncased')
|
model = BertModel.from_pretrained('bert-base-uncased')
|
||||||
|
|
||||||
# Set the model in evaluation mode to desactivate the DropOut modules
|
# Set the model in evaluation mode to deactivate the DropOut modules
|
||||||
# This is IMPORTANT to have reproductible results during evaluation!
|
# This is IMPORTANT to have reproducible results during evaluation!
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
# If you have a GPU, put everything on cuda
|
# If you have a GPU, put everything on cuda
|
||||||
@@ -106,7 +106,7 @@ model.to('cuda')
|
|||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# See the models docstrings for the detail of the inputs
|
# See the models docstrings for the detail of the inputs
|
||||||
outputs = model(tokens_tensor, token_type_ids=segments_tensors)
|
outputs = model(tokens_tensor, token_type_ids=segments_tensors)
|
||||||
# PyTorch-Transformers models always output tuples.
|
# Transformers models always output tuples.
|
||||||
# See the models docstrings for the detail of all the outputs
|
# See the models docstrings for the detail of all the outputs
|
||||||
# In our case, the first element is the hidden state of the last layer of the Bert model
|
# In our case, the first element is the hidden state of the last layer of the Bert model
|
||||||
encoded_layers = outputs[0]
|
encoded_layers = outputs[0]
|
||||||
@@ -145,7 +145,7 @@ First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
|
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||||
|
|
||||||
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
|
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
|
||||||
import logging
|
import logging
|
||||||
@@ -168,8 +168,8 @@ Let's see how to use `GPT2LMHeadModel` to generate the next token following our
|
|||||||
# Load pre-trained model (weights)
|
# Load pre-trained model (weights)
|
||||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||||
|
|
||||||
# Set the model in evaluation mode to desactivate the DropOut modules
|
# Set the model in evaluation mode to deactivate the DropOut modules
|
||||||
# This is IMPORTANT to have reproductible results during evaluation!
|
# This is IMPORTANT to have reproducible results during evaluation!
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
# If you have a GPU, put everything on cuda
|
# If you have a GPU, put everything on cuda
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ where
|
|||||||
* ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
|
* ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
|
||||||
* ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
|
* ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
|
||||||
|
|
||||||
If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
|
If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
|
||||||
|
|
||||||
*
|
*
|
||||||
``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
|
``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
|
||||||
@@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to
|
|||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
|
from transformers import WEIGHTS_NAME, CONFIG_NAME
|
||||||
|
|
||||||
output_dir = "./models/"
|
output_dir = "./models/"
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab
|
|||||||
Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
|
Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
|
||||||
their model to be re-used in other programs, such as efficiency-oriented C++ programs.
|
their model to be re-used in other programs, such as efficiency-oriented C++ programs.
|
||||||
|
|
||||||
We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can
|
We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
|
||||||
be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
|
be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
|
||||||
they can be exported, and what to be mindful of when using these models with TorchScript.
|
they can be exported, and what to be mindful of when using these models with TorchScript.
|
||||||
|
|
||||||
@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
|
|||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
from pytorch_transformers import BertModel, BertTokenizer, BertConfig
|
from transformers import BertModel, BertTokenizer, BertConfig
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
enc = BertTokenizer.from_pretrained("bert-base-uncased")
|
enc = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ similar API between the different models.
|
|||||||
|
|
||||||
## Language model fine-tuning
|
## Language model fine-tuning
|
||||||
|
|
||||||
Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py).
|
Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
|
||||||
|
|
||||||
Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT
|
Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT
|
||||||
to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa
|
to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa
|
||||||
@@ -75,7 +75,7 @@ python run_lm_finetuning.py \
|
|||||||
|
|
||||||
## Language generation
|
## Language generation
|
||||||
|
|
||||||
Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py).
|
Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
|
||||||
|
|
||||||
Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
|
Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
|
||||||
A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
|
A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
|
||||||
@@ -91,26 +91,26 @@ python run_generation.py \
|
|||||||
|
|
||||||
## GLUE
|
## GLUE
|
||||||
|
|
||||||
Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py).
|
Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
|
||||||
|
|
||||||
Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
|
Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
|
||||||
Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
|
Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
|
||||||
|
|
||||||
GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
|
GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
|
||||||
uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train
|
uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train
|
||||||
batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
|
batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
|
||||||
between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
|
between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
|
||||||
|
|
||||||
| Task | Metric | Result |
|
| Task | Metric | Result |
|
||||||
|-------|------------------------------|-------------|
|
|-------|------------------------------|-------------|
|
||||||
| CoLA | Matthew's corr | 55.75 |
|
| CoLA | Matthew's corr | 48.87 |
|
||||||
| SST-2 | Accuracy | 92.09 |
|
| SST-2 | Accuracy | 91.74 |
|
||||||
| MRPC | F1/Accuracy | 90.48/86.27 |
|
| MRPC | F1/Accuracy | 90.70/86.27 |
|
||||||
| STS-B | Person/Spearman corr. | 89.03/88.64 |
|
| STS-B | Pearson/Spearman corr. | 91.39/91.04 |
|
||||||
| QQP | Accuracy/F1 | 90.92/87.72 |
|
| QQP | Accuracy/F1 | 90.79/87.66 |
|
||||||
| MNLI | Matched acc./Mismatched acc. | 83.74/84.06 |
|
| MNLI | Matched acc./Mismatched acc. | 83.70/84.83 |
|
||||||
| QNLI | Accuracy | 91.07 |
|
| QNLI | Accuracy | 89.31 |
|
||||||
| RTE | Accuracy | 68.59 |
|
| RTE | Accuracy | 71.43 |
|
||||||
| WNLI | Accuracy | 43.66 |
|
| WNLI | Accuracy | 43.66 |
|
||||||
|
|
||||||
Some of these results are significantly different from the ones reported on the test set
|
Some of these results are significantly different from the ones reported on the test set
|
||||||
@@ -319,7 +319,7 @@ eval_loss = 0.44457291918821606
|
|||||||
|
|
||||||
## SQuAD
|
## SQuAD
|
||||||
|
|
||||||
Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py).
|
Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
|
||||||
|
|
||||||
#### Fine-tuning on SQuAD
|
#### Fine-tuning on SQuAD
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ import torch
|
|||||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||||
TensorDataset)
|
TensorDataset)
|
||||||
|
|
||||||
from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
||||||
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
||||||
WarmupLinearSchedule)
|
WarmupLinearSchedule)
|
||||||
|
|
||||||
|
|||||||
@@ -35,10 +35,10 @@ from tqdm import tqdm, trange
|
|||||||
|
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForMultipleChoice, BertTokenizer)
|
BertForMultipleChoice, BertTokenizer)
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, WarmupLinearSchedule
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -365,7 +365,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
# inputs.update({'cls_index': batch[5],
|
# inputs.update({'cls_index': batch[5],
|
||||||
# 'p_mask': batch[6]})
|
# 'p_mask': batch[6]})
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||||
@@ -647,7 +647,7 @@ def main():
|
|||||||
|
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||||
|
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import math
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
|
from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
|
|||||||
@@ -2,25 +2,34 @@
|
|||||||
|
|
||||||
This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT.
|
This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT.
|
||||||
|
|
||||||
|
**2019, September 19th - Update:** We fixed bugs in the code and released an updated version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and an 86.9 F1 score on the SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
|
||||||
|
|
||||||
## What is DistilBERT
|
## What is DistilBERT
|
||||||
|
|
||||||
DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
|
DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on the BERT architecture. It has 40% fewer parameters than `bert-base-uncased` and runs 60% faster while preserving 97% of BERT's performance as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distilling BERT, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option for putting large-scale trained Transformer models into production.
|
||||||
|
|
||||||
For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
|
For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
|
||||||
).
|
). *Please note that we will publish a formal write-up with updated and more complete results in the near future (September 19th).*
|
||||||
|
|
||||||
|
Here are the updated results on the dev sets of GLUE:
|
||||||
|
|
||||||
|
| Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2 | STS-B | WNLI |
|
||||||
|
| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
|
||||||
|
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
||||||
|
| DistilBERT | **75.2** | 49.1 | 81.8 | 90.2 | 87.0 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
|
This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
|
||||||
|
|
||||||
**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details.
|
**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend [compiling PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details.
|
||||||
|
|
||||||
## How to use DistilBERT
|
## How to use DistilBERT
|
||||||
|
|
||||||
PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
||||||
|
|
||||||
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimensions and 12 heads, totaling 66M parameters.
|
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimensions and 12 heads, totaling 66M parameters.
|
||||||
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.2 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
|
- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, the `bert-base-uncased` version of Bert reaches an 88.5 F1 score).
|
||||||
|
|
||||||
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`, even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to keep naming consistent across the library's models.
|
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`, even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to keep naming consistent across the library's models.
|
||||||
|
|
||||||
|
|||||||
@@ -92,11 +92,11 @@ class Dataset:
|
|||||||
Too short sequences are simply removed. This could be tunedd.
|
Too short sequences are simply removed. This could be tunedd.
|
||||||
"""
|
"""
|
||||||
init_size = len(self)
|
init_size = len(self)
|
||||||
indices = self.lengths > 5
|
indices = self.lengths > 11
|
||||||
self.token_ids = self.token_ids[indices]
|
self.token_ids = self.token_ids[indices]
|
||||||
self.lengths = self.lengths[indices]
|
self.lengths = self.lengths[indices]
|
||||||
new_size = len(self)
|
new_size = len(self)
|
||||||
logger.info(f'Remove {init_size - new_size} too short (<=5 tokens) sequences.')
|
logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.')
|
||||||
|
|
||||||
def print_statistics(self):
|
def print_statistics(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -18,15 +18,18 @@
|
|||||||
import os
|
import os
|
||||||
import math
|
import math
|
||||||
import psutil
|
import psutil
|
||||||
|
import time
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from tqdm import trange, tqdm
|
from tqdm import trange, tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import psutil
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
from torch.optim import AdamW
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import WarmupLinearSchedule
|
||||||
|
|
||||||
from utils import logger
|
from utils import logger
|
||||||
from dataset import Dataset
|
from dataset import Dataset
|
||||||
@@ -58,10 +61,12 @@ class Distiller:
|
|||||||
self.alpha_ce = params.alpha_ce
|
self.alpha_ce = params.alpha_ce
|
||||||
self.alpha_mlm = params.alpha_mlm
|
self.alpha_mlm = params.alpha_mlm
|
||||||
self.alpha_mse = params.alpha_mse
|
self.alpha_mse = params.alpha_mse
|
||||||
|
self.alpha_cos = params.alpha_cos
|
||||||
assert self.alpha_ce >= 0.
|
assert self.alpha_ce >= 0.
|
||||||
assert self.alpha_mlm >= 0.
|
assert self.alpha_mlm >= 0.
|
||||||
assert self.alpha_mse >= 0.
|
assert self.alpha_mse >= 0.
|
||||||
assert self.alpha_ce + self.alpha_mlm + self.alpha_mse > 0.
|
assert self.alpha_cos >= 0.
|
||||||
|
assert self.alpha_ce + self.alpha_mlm + self.alpha_mse + self.alpha_cos > 0.
|
||||||
|
|
||||||
self.mlm_mask_prop = params.mlm_mask_prop
|
self.mlm_mask_prop = params.mlm_mask_prop
|
||||||
assert 0.0 <= self.mlm_mask_prop <= 1.0
|
assert 0.0 <= self.mlm_mask_prop <= 1.0
|
||||||
@@ -81,17 +86,21 @@ class Distiller:
|
|||||||
self.last_loss = 0
|
self.last_loss = 0
|
||||||
self.last_loss_ce = 0
|
self.last_loss_ce = 0
|
||||||
self.last_loss_mlm = 0
|
self.last_loss_mlm = 0
|
||||||
self.last_loss_mse = 0
|
if self.alpha_mse > 0.: self.last_loss_mse = 0
|
||||||
|
if self.alpha_cos > 0.: self.last_loss_cos = 0
|
||||||
|
self.last_log = 0
|
||||||
|
|
||||||
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
|
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
|
||||||
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
||||||
self.mse_loss_fct = nn.MSELoss(reduction='sum')
|
if self.alpha_mse > 0.:
|
||||||
|
self.mse_loss_fct = nn.MSELoss(reduction='sum')
|
||||||
|
if self.alpha_cos > 0.:
|
||||||
|
self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
|
||||||
|
|
||||||
logger.info('--- Initializing model optimizer')
|
logger.info('--- Initializing model optimizer')
|
||||||
assert params.gradient_accumulation_steps >= 1
|
assert params.gradient_accumulation_steps >= 1
|
||||||
self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
|
self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
|
||||||
num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
|
num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
|
||||||
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
|
|
||||||
|
|
||||||
no_decay = ['bias', 'LayerNorm.weight']
|
no_decay = ['bias', 'LayerNorm.weight']
|
||||||
optimizer_grouped_parameters = [
|
optimizer_grouped_parameters = [
|
||||||
@@ -104,9 +113,11 @@ class Distiller:
|
|||||||
lr=params.learning_rate,
|
lr=params.learning_rate,
|
||||||
eps=params.adam_epsilon,
|
eps=params.adam_epsilon,
|
||||||
betas=(0.9, 0.98))
|
betas=(0.9, 0.98))
|
||||||
|
|
||||||
|
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
|
||||||
self.scheduler = WarmupLinearSchedule(self.optimizer,
|
self.scheduler = WarmupLinearSchedule(self.optimizer,
|
||||||
warmup_steps=warmup_steps,
|
warmup_steps=warmup_steps,
|
||||||
t_total=num_train_optimization_steps)
|
t_total=num_train_optimization_steps)
|
||||||
|
|
||||||
if self.fp16:
|
if self.fp16:
|
||||||
try:
|
try:
|
||||||
@@ -272,11 +283,14 @@ class Distiller:
|
|||||||
The real training loop.
|
The real training loop.
|
||||||
"""
|
"""
|
||||||
if self.is_master: logger.info('Starting training')
|
if self.is_master: logger.info('Starting training')
|
||||||
|
self.last_log = time.time()
|
||||||
self.student.train()
|
self.student.train()
|
||||||
self.teacher.eval()
|
self.teacher.eval()
|
||||||
|
|
||||||
for _ in range(self.params.n_epoch):
|
for _ in range(self.params.n_epoch):
|
||||||
if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
|
if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
|
||||||
|
if self.multi_gpu:
|
||||||
|
torch.distributed.barrier()
|
||||||
|
|
||||||
iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
|
iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
|
||||||
for __ in range(self.num_steps_epoch):
|
for __ in range(self.num_steps_epoch):
|
||||||
@@ -314,9 +328,9 @@ class Distiller:
|
|||||||
attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
|
attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
|
||||||
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
|
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
|
||||||
"""
|
"""
|
||||||
s_logits = self.student(input_ids=input_ids, attention_mask=attention_mask)[0] # (bs, seq_length, voc_size)
|
s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
t_logits = self.teacher(input_ids=input_ids, attention_mask=attention_mask)[0] # (bs, seq_length, voc_size)
|
t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
|
||||||
assert s_logits.size() == t_logits.size()
|
assert s_logits.size() == t_logits.size()
|
||||||
|
|
||||||
#https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
|
#https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
|
||||||
@@ -340,6 +354,22 @@ class Distiller:
|
|||||||
if self.alpha_mse > 0.:
|
if self.alpha_mse > 0.:
|
||||||
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
|
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
|
||||||
loss += self.alpha_mse * loss_mse
|
loss += self.alpha_mse * loss_mse
|
||||||
|
|
||||||
|
if self.alpha_cos > 0.:
|
||||||
|
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
|
||||||
|
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
|
||||||
|
mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim)
|
||||||
|
assert s_hidden_states.size() == t_hidden_states.size()
|
||||||
|
dim = s_hidden_states.size(-1)
|
||||||
|
|
||||||
|
s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim)
|
||||||
|
s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
|
||||||
|
t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim)
|
||||||
|
t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
|
||||||
|
|
||||||
|
target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
|
||||||
|
loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
|
||||||
|
loss += self.alpha_cos * loss_cos
|
||||||
|
|
||||||
self.total_loss_epoch += loss.item()
|
self.total_loss_epoch += loss.item()
|
||||||
self.last_loss = loss.item()
|
self.last_loss = loss.item()
|
||||||
@@ -348,6 +378,8 @@ class Distiller:
|
|||||||
self.last_loss_mlm = loss_mlm.item()
|
self.last_loss_mlm = loss_mlm.item()
|
||||||
if self.alpha_mse > 0.:
|
if self.alpha_mse > 0.:
|
||||||
self.last_loss_mse = loss_mse.item()
|
self.last_loss_mse = loss_mse.item()
|
||||||
|
if self.alpha_cos > 0.:
|
||||||
|
self.last_loss_cos = loss_cos.item()
|
||||||
|
|
||||||
self.optimize(loss)
|
self.optimize(loss)
|
||||||
|
|
||||||
@@ -396,6 +428,7 @@ class Distiller:
|
|||||||
|
|
||||||
if self.n_total_iter % self.params.log_interval == 0:
|
if self.n_total_iter % self.params.log_interval == 0:
|
||||||
self.log_tensorboard()
|
self.log_tensorboard()
|
||||||
|
self.last_log = time.time()
|
||||||
if self.n_total_iter % self.params.checkpoint_interval == 0:
|
if self.n_total_iter % self.params.checkpoint_interval == 0:
|
||||||
self.save_checkpoint()
|
self.save_checkpoint()
|
||||||
|
|
||||||
@@ -421,9 +454,12 @@ class Distiller:
|
|||||||
self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
|
||||||
if self.alpha_mse > 0.:
|
if self.alpha_mse > 0.:
|
||||||
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
|
||||||
|
if self.alpha_cos > 0.:
|
||||||
|
self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
|
||||||
self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
|
||||||
|
|
||||||
self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
|
||||||
|
self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)
|
||||||
|
|
||||||
def end_epoch(self):
|
def end_epoch(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -2,3 +2,5 @@ gitpython==3.0.2
|
|||||||
tensorboard>=1.14.0
|
tensorboard>=1.14.0
|
||||||
tensorboardX==1.8
|
tensorboardX==1.8
|
||||||
psutil==5.6.3
|
psutil==5.6.3
|
||||||
|
scipy==1.3.1
|
||||||
|
pytorch_transformers==1.2.0
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ import pickle
|
|||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pytorch_transformers import BertTokenizer
|
from transformers import BertTokenizer, RobertaTokenizer
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
@@ -32,16 +32,21 @@ def main():
|
|||||||
parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
|
parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
|
||||||
parser.add_argument('--file_path', type=str, default='data/dump.txt',
|
parser.add_argument('--file_path', type=str, default='data/dump.txt',
|
||||||
help='The path to the data.')
|
help='The path to the data.')
|
||||||
parser.add_argument('--bert_tokenizer', type=str, default='bert-base-uncased',
|
parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta'])
|
||||||
|
parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
|
||||||
help="The tokenizer to use.")
|
help="The tokenizer to use.")
|
||||||
parser.add_argument('--dump_file', type=str, default='data/dump',
|
parser.add_argument('--dump_file', type=str, default='data/dump',
|
||||||
help='The dump file prefix.')
|
help='The dump file prefix.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
logger.info(f'Loading Tokenizer ({args.bert_tokenizer})')
|
logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
|
||||||
bert_tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer)
|
if args.tokenizer_type == 'bert':
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
|
||||||
|
elif args.tokenizer_type == 'roberta':
|
||||||
|
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
|
||||||
|
bos = tokenizer.special_tokens_map['bos_token'] # `[CLS]` for bert, `<s>` for roberta
|
||||||
|
sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]` for bert, `</s>` for roberta
|
||||||
|
|
||||||
logger.info(f'Loading text from {args.file_path}')
|
logger.info(f'Loading text from {args.file_path}')
|
||||||
with open(args.file_path, 'r', encoding='utf8') as fp:
|
with open(args.file_path, 'r', encoding='utf8') as fp:
|
||||||
@@ -56,8 +61,8 @@ def main():
|
|||||||
interval = 10000
|
interval = 10000
|
||||||
start = time.time()
|
start = time.time()
|
||||||
for text in data:
|
for text in data:
|
||||||
text = f'[CLS] {text.strip()} [SEP]'
|
text = f'{bos} {text.strip()} {sep}'
|
||||||
token_ids = bert_tokenizer.encode(text)
|
token_ids = tokenizer.encode(text)
|
||||||
rslt.append(token_ids)
|
rslt.append(token_ids)
|
||||||
|
|
||||||
iter += 1
|
iter += 1
|
||||||
@@ -69,7 +74,7 @@ def main():
|
|||||||
logger.info(f'{len(data)} examples processed.')
|
logger.info(f'{len(data)} examples processed.')
|
||||||
|
|
||||||
|
|
||||||
dp_file = f'{args.dump_file}.{args.bert_tokenizer}.pickle'
|
dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
|
||||||
rslt_ = [np.uint16(d) for d in rslt]
|
rslt_ = [np.uint16(d) for d in rslt]
|
||||||
random.shuffle(rslt_)
|
random.shuffle(rslt_)
|
||||||
logger.info(f'Dump to {dp_file}')
|
logger.info(f'Dump to {dp_file}')
|
||||||
|
|||||||
@@ -15,59 +15,73 @@
|
|||||||
"""
|
"""
|
||||||
Preprocessing script before training DistilBERT.
|
Preprocessing script before training DistilBERT.
|
||||||
"""
|
"""
|
||||||
from pytorch_transformers import BertForPreTraining
|
from transformers import BertForMaskedLM, RobertaForMaskedLM
|
||||||
import torch
|
import torch
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForPreTraining for Transfer Learned Distillation")
|
parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
|
||||||
parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
|
parser.add_argument("--model_type", default="bert", choices=["bert", "roberta"])
|
||||||
parser.add_argument("--dump_checkpoint", default='serialization_dir/transfer_learning_checkpoint_0247911.pth', type=str)
|
parser.add_argument("--model_name", default='bert-base-uncased', type=str)
|
||||||
|
parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
|
||||||
parser.add_argument("--vocab_transform", action='store_true')
|
parser.add_argument("--vocab_transform", action='store_true')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
model = BertForPreTraining.from_pretrained(args.bert_model)
|
if args.model_type == 'bert':
|
||||||
|
model = BertForMaskedLM.from_pretrained(args.model_name)
|
||||||
|
prefix = 'bert'
|
||||||
|
elif args.model_type == 'roberta':
|
||||||
|
model = RobertaForMaskedLM.from_pretrained(args.model_name)
|
||||||
|
prefix = 'roberta'
|
||||||
|
|
||||||
state_dict = model.state_dict()
|
state_dict = model.state_dict()
|
||||||
compressed_sd = {}
|
compressed_sd = {}
|
||||||
|
|
||||||
for w in ['word_embeddings', 'position_embeddings']:
|
for w in ['word_embeddings', 'position_embeddings']:
|
||||||
compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
|
compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
|
||||||
state_dict[f'bert.embeddings.{w}.weight']
|
state_dict[f'{prefix}.embeddings.{w}.weight']
|
||||||
for w in ['weight', 'bias']:
|
for w in ['weight', 'bias']:
|
||||||
compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
|
compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
|
||||||
state_dict[f'bert.embeddings.LayerNorm.{w}']
|
state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
|
||||||
|
|
||||||
std_idx = 0
|
std_idx = 0
|
||||||
for teacher_idx in [0, 2, 4, 7, 9, 11]:
|
for teacher_idx in [0, 2, 4, 7, 9, 11]:
|
||||||
for w in ['weight', 'bias']:
|
for w in ['weight', 'bias']:
|
||||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
|
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
|
||||||
state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.query.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}']
|
||||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
|
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
|
||||||
state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.key.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}']
|
||||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
|
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
|
||||||
state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.value.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}']
|
||||||
|
|
||||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
|
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
|
||||||
state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
|
||||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
|
compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
|
||||||
state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
|
||||||
|
|
||||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
|
compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
|
||||||
state_dict[f'bert.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
|
||||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
|
compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
|
||||||
state_dict[f'bert.encoder.layer.{teacher_idx}.output.dense.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}']
|
||||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
|
compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
|
||||||
state_dict[f'bert.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
|
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
|
||||||
std_idx += 1
|
std_idx += 1
|
||||||
|
|
||||||
compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
|
if args.model_type == 'bert':
|
||||||
compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
|
compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
|
||||||
if args.vocab_transform:
|
compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
|
||||||
for w in ['weight', 'bias']:
|
if args.vocab_transform:
|
||||||
compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
|
for w in ['weight', 'bias']:
|
||||||
compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
|
compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
|
||||||
|
compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
|
||||||
|
elif args.model_type == 'roberta':
|
||||||
|
compressed_sd[f'vocab_projector.weight'] = state_dict[f'lm_head.decoder.weight']
|
||||||
|
compressed_sd[f'vocab_projector.bias'] = state_dict[f'lm_head.bias']
|
||||||
|
if args.vocab_transform:
|
||||||
|
for w in ['weight', 'bias']:
|
||||||
|
compressed_sd[f'vocab_transform.{w}'] = state_dict[f'lm_head.dense.{w}']
|
||||||
|
compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
|
||||||
|
|
||||||
print(f'N layers selected for distillation: {std_idx}')
|
print(f'N layers selected for distillation: {std_idx}')
|
||||||
print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
|
print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
|
||||||
|
|||||||
@@ -23,8 +23,8 @@ import shutil
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import BertTokenizer, BertForMaskedLM
|
from transformers import BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM
|
||||||
from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig
|
from transformers import DistilBertForMaskedLM, DistilBertConfig
|
||||||
|
|
||||||
from distiller import Distiller
|
from distiller import Distiller
|
||||||
from utils import git_log, logger, init_gpu_params, set_seed
|
from utils import git_log, logger, init_gpu_params, set_seed
|
||||||
@@ -70,8 +70,10 @@ def main():
|
|||||||
help="Load student initialization checkpoint.")
|
help="Load student initialization checkpoint.")
|
||||||
parser.add_argument("--from_pretrained_config", default=None, type=str,
|
parser.add_argument("--from_pretrained_config", default=None, type=str,
|
||||||
help="Load student initialization architecture config.")
|
help="Load student initialization architecture config.")
|
||||||
parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
|
parser.add_argument("--teacher_type", default="bert", choices=["bert", "roberta"],
|
||||||
help="The teacher BERT model.")
|
help="Teacher type (BERT, RoBERTa).")
|
||||||
|
parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
|
||||||
|
help="The teacher model.")
|
||||||
|
|
||||||
parser.add_argument("--temperature", default=2., type=float,
|
parser.add_argument("--temperature", default=2., type=float,
|
||||||
help="Temperature for the softmax temperature.")
|
help="Temperature for the softmax temperature.")
|
||||||
@@ -81,6 +83,8 @@ def main():
|
|||||||
help="Linear weight for the MLM loss. Must be >=0.")
|
help="Linear weight for the MLM loss. Must be >=0.")
|
||||||
parser.add_argument("--alpha_mse", default=0.0, type=float,
|
parser.add_argument("--alpha_mse", default=0.0, type=float,
|
||||||
help="Linear weight of the MSE loss. Must be >=0.")
|
help="Linear weight of the MSE loss. Must be >=0.")
|
||||||
|
parser.add_argument("--alpha_cos", default=0.0, type=float,
|
||||||
|
help="Linear weight of the cosine embedding loss. Must be >=0.")
|
||||||
parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
|
parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
|
||||||
help="Proportion of tokens for which we need to make a prediction.")
|
help="Proportion of tokens for which we need to make a prediction.")
|
||||||
parser.add_argument("--word_mask", default=0.8, type=float,
|
parser.add_argument("--word_mask", default=0.8, type=float,
|
||||||
@@ -165,11 +169,14 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
### TOKENIZER ###
|
### TOKENIZER ###
|
||||||
bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model)
|
if args.teacher_type == 'bert':
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(args.teacher_name)
|
||||||
|
elif args.teacher_type == 'roberta':
|
||||||
|
tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
|
||||||
special_tok_ids = {}
|
special_tok_ids = {}
|
||||||
for tok_name, tok_symbol in bert_tokenizer.special_tokens_map.items():
|
for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
|
||||||
idx = bert_tokenizer.all_special_tokens.index(tok_symbol)
|
idx = tokenizer.all_special_tokens.index(tok_symbol)
|
||||||
special_tok_ids[tok_name] = bert_tokenizer.all_special_ids[idx]
|
special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
|
||||||
logger.info(f'Special tokens {special_tok_ids}')
|
logger.info(f'Special tokens {special_tok_ids}')
|
||||||
args.special_tok_ids = special_tok_ids
|
args.special_tok_ids = special_tok_ids
|
||||||
|
|
||||||
@@ -197,16 +204,17 @@ def main():
|
|||||||
|
|
||||||
## STUDENT ##
|
## STUDENT ##
|
||||||
if args.from_pretrained_weights is not None:
|
if args.from_pretrained_weights is not None:
|
||||||
assert os.path.isfile(os.path.join(args.from_pretrained_weights))
|
assert os.path.isfile(args.from_pretrained_weights)
|
||||||
assert os.path.isfile(os.path.join(args.from_pretrained_config))
|
assert os.path.isfile(args.from_pretrained_config)
|
||||||
logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
|
logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
|
||||||
logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
|
logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
|
||||||
stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
|
stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
|
||||||
|
stu_architecture_config.output_hidden_states = True
|
||||||
student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
|
student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
|
||||||
config=stu_architecture_config)
|
config=stu_architecture_config)
|
||||||
else:
|
else:
|
||||||
args.vocab_size_or_config_json_file = args.vocab_size
|
args.vocab_size_or_config_json_file = args.vocab_size
|
||||||
stu_architecture_config = DistilBertConfig(**vars(args))
|
stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
|
||||||
student = DistilBertForMaskedLM(stu_architecture_config)
|
student = DistilBertForMaskedLM(stu_architecture_config)
|
||||||
|
|
||||||
|
|
||||||
@@ -216,10 +224,13 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
## TEACHER ##
|
## TEACHER ##
|
||||||
teacher = BertForMaskedLM.from_pretrained(args.bert_model)
|
if args.teacher_type == 'bert':
|
||||||
|
teacher = BertForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
|
||||||
|
elif args.teacher_type == 'roberta':
|
||||||
|
teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
|
||||||
if args.n_gpu > 0:
|
if args.n_gpu > 0:
|
||||||
teacher.to(f'cuda:{args.local_rank}')
|
teacher.to(f'cuda:{args.local_rank}')
|
||||||
logger.info(f'Teacher loaded from {args.bert_model}.')
|
logger.info(f'Teacher loaded from {args.teacher_name}.')
|
||||||
|
|
||||||
## DISTILLER ##
|
## DISTILLER ##
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
|
|||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
from torch.nn import CrossEntropyLoss, MSELoss
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME,
|
from transformers import (WEIGHTS_NAME,
|
||||||
BertConfig, BertForSequenceClassification, BertTokenizer,
|
BertConfig, BertForSequenceClassification, BertTokenizer,
|
||||||
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
|
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
|
||||||
XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
|
XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
|
||||||
|
|||||||
@@ -26,12 +26,12 @@ import torch
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
|
from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
|
||||||
|
|
||||||
from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
||||||
from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
|
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
|
||||||
from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
|
from transformers import XLNetLMHeadModel, XLNetTokenizer
|
||||||
from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
|
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
|
||||||
|
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from torch.utils.data.distributed import DistributedSampler
|
|||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForSequenceClassification, BertTokenizer,
|
BertForSequenceClassification, BertTokenizer,
|
||||||
RobertaConfig,
|
RobertaConfig,
|
||||||
RobertaForSequenceClassification,
|
RobertaForSequenceClassification,
|
||||||
@@ -39,12 +39,17 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLMConfig, XLMForSequenceClassification,
|
XLMConfig, XLMForSequenceClassification,
|
||||||
XLMTokenizer, XLNetConfig,
|
XLMTokenizer, XLNetConfig,
|
||||||
XLNetForSequenceClassification,
|
XLNetForSequenceClassification,
|
||||||
XLNetTokenizer)
|
XLNetTokenizer,
|
||||||
|
DistilBertConfig,
|
||||||
|
DistilBertForSequenceClassification,
|
||||||
|
DistilBertTokenizer)
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, WarmupLinearSchedule
|
||||||
|
|
||||||
from utils_glue import (compute_metrics, convert_examples_to_features,
|
from transformers import glue_compute_metrics as compute_metrics
|
||||||
output_modes, processors)
|
from transformers import glue_output_modes as output_modes
|
||||||
|
from transformers import glue_processors as processors
|
||||||
|
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -55,6 +60,7 @@ MODEL_CLASSES = {
|
|||||||
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
||||||
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
||||||
|
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -128,10 +134,11 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
batch = tuple(t.to(args.device) for t in batch)
|
batch = tuple(t.to(args.device) for t in batch)
|
||||||
inputs = {'input_ids': batch[0],
|
inputs = {'input_ids': batch[0],
|
||||||
'attention_mask': batch[1],
|
'attention_mask': batch[1],
|
||||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids
|
|
||||||
'labels': batch[3]}
|
'labels': batch[3]}
|
||||||
|
if args.model_type != 'distilbert':
|
||||||
|
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||||
@@ -148,8 +155,8 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
|
|
||||||
tr_loss += loss.item()
|
tr_loss += loss.item()
|
||||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||||
scheduler.step() # Update learning rate schedule
|
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
scheduler.step() # Update learning rate schedule
|
||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
@@ -218,8 +225,9 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
inputs = {'input_ids': batch[0],
|
inputs = {'input_ids': batch[0],
|
||||||
'attention_mask': batch[1],
|
'attention_mask': batch[1],
|
||||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids
|
|
||||||
'labels': batch[3]}
|
'labels': batch[3]}
|
||||||
|
if args.model_type != 'distilbert':
|
||||||
|
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
tmp_eval_loss, logits = outputs[:2]
|
tmp_eval_loss, logits = outputs[:2]
|
||||||
|
|
||||||
@@ -272,15 +280,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
# HACK(label indices are swapped in RoBERTa pretrained model)
|
# HACK(label indices are swapped in RoBERTa pretrained model)
|
||||||
label_list[1], label_list[2] = label_list[2], label_list[1]
|
label_list[1], label_list[2] = label_list[2], label_list[1]
|
||||||
examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||||
features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
|
features = convert_examples_to_features(examples,
|
||||||
cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end
|
tokenizer,
|
||||||
cls_token=tokenizer.cls_token,
|
label_list=label_list,
|
||||||
cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
|
max_length=args.max_seq_length,
|
||||||
sep_token=tokenizer.sep_token,
|
output_mode=output_mode,
|
||||||
sep_token_extra=bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
||||||
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
|
||||||
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
|
|
||||||
)
|
)
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
@@ -291,14 +298,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
|
|
||||||
# Convert to Tensors and build dataset
|
# Convert to Tensors and build dataset
|
||||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||||
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
|
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
|
||||||
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
|
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
|
||||||
if output_mode == "classification":
|
if output_mode == "classification":
|
||||||
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
|
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||||
elif output_mode == "regression":
|
elif output_mode == "regression":
|
||||||
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
|
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
||||||
|
|
||||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
|
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
@@ -478,7 +485,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
|
|||||||
@@ -35,11 +35,12 @@ from torch.utils.data.distributed import DistributedSampler
|
|||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
|
from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
|
||||||
BertConfig, BertForMaskedLM, BertTokenizer,
|
BertConfig, BertForMaskedLM, BertTokenizer,
|
||||||
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
||||||
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
||||||
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
|
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
|
||||||
|
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -49,7 +50,8 @@ MODEL_CLASSES = {
|
|||||||
'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
|
'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
|
||||||
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||||
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
||||||
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
|
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
||||||
|
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -57,7 +59,7 @@ class TextDataset(Dataset):
|
|||||||
def __init__(self, tokenizer, file_path='train', block_size=512):
|
def __init__(self, tokenizer, file_path='train', block_size=512):
|
||||||
assert os.path.isfile(file_path)
|
assert os.path.isfile(file_path)
|
||||||
directory, filename = os.path.split(file_path)
|
directory, filename = os.path.split(file_path)
|
||||||
cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')
|
cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
|
||||||
|
|
||||||
if os.path.exists(cached_features_file):
|
if os.path.exists(cached_features_file):
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
@@ -72,9 +74,8 @@ class TextDataset(Dataset):
|
|||||||
|
|
||||||
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
||||||
|
|
||||||
while len(tokenized_text) >= block_size: # Truncate in block of block_size
|
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
||||||
self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
|
self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
|
||||||
tokenized_text = tokenized_text[block_size:]
|
|
||||||
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
||||||
# If your dataset is small, first you should look for a bigger one :-) and second you
|
# If your dataset is small, first you should look for a bigger one :-) and second you
|
||||||
# can change this behavior by adding (model specific) padding.
|
# can change this behavior by adding (model specific) padding.
|
||||||
@@ -186,7 +187,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
labels = labels.to(args.device)
|
labels = labels.to(args.device)
|
||||||
model.train()
|
model.train()
|
||||||
outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
|
outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||||
@@ -380,7 +381,7 @@ def main():
|
|||||||
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.model_type in ["bert", "roberta"] and not args.mlm:
|
if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
|
||||||
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
||||||
"flag (masked language modeling).")
|
"flag (masked language modeling).")
|
||||||
if args.eval_data_file is None and args.do_eval:
|
if args.eval_data_file is None and args.do_eval:
|
||||||
@@ -479,7 +480,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
|
|||||||
@@ -32,13 +32,13 @@ from torch.utils.data.distributed import DistributedSampler
|
|||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForMultipleChoice, BertTokenizer,
|
BertForMultipleChoice, BertTokenizer,
|
||||||
XLNetConfig, XLNetForMultipleChoice,
|
XLNetConfig, XLNetForMultipleChoice,
|
||||||
XLNetTokenizer, RobertaConfig,
|
XLNetTokenizer, RobertaConfig,
|
||||||
RobertaForMultipleChoice, RobertaTokenizer)
|
RobertaForMultipleChoice, RobertaTokenizer)
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, WarmupLinearSchedule
|
||||||
|
|
||||||
from utils_multiple_choice import (convert_examples_to_features, processors)
|
from utils_multiple_choice import (convert_examples_to_features, processors)
|
||||||
|
|
||||||
@@ -141,7 +141,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
|
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
|
||||||
'labels': batch[3]}
|
'labels': batch[3]}
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||||
@@ -508,7 +508,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
@@ -524,7 +524,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
# if args.eval_all_checkpoints: # can not use this to do test!!
|
# if args.eval_all_checkpoints: # can not use this to do test!!
|
||||||
# checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
# checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
# logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
# logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
|
|||||||
@@ -32,14 +32,15 @@ from tqdm import tqdm, trange
|
|||||||
|
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
|
|
||||||
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForQuestionAnswering, BertTokenizer,
|
BertForQuestionAnswering, BertTokenizer,
|
||||||
XLMConfig, XLMForQuestionAnswering,
|
XLMConfig, XLMForQuestionAnswering,
|
||||||
XLMTokenizer, XLNetConfig,
|
XLMTokenizer, XLNetConfig,
|
||||||
XLNetForQuestionAnswering,
|
XLNetForQuestionAnswering,
|
||||||
XLNetTokenizer)
|
XLNetTokenizer,
|
||||||
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
|
|
||||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, WarmupLinearSchedule
|
||||||
|
|
||||||
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
||||||
RawResult, write_predictions,
|
RawResult, write_predictions,
|
||||||
@@ -59,6 +60,7 @@ MODEL_CLASSES = {
|
|||||||
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
||||||
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
||||||
|
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
def set_seed(args):
|
def set_seed(args):
|
||||||
@@ -140,7 +142,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
inputs.update({'cls_index': batch[5],
|
inputs.update({'cls_index': batch[5],
|
||||||
'p_mask': batch[6]})
|
'p_mask': batch[6]})
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1:
|
||||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||||
@@ -508,7 +510,7 @@ def main():
|
|||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||||
|
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
|
|
||||||
|
|||||||
40
examples/run_tf_glue.py
Normal file
40
examples/run_tf_glue.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow_datasets
|
||||||
|
from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
|
||||||
|
|
||||||
|
# Load dataset, tokenizer, model from pretrained model/vocabulary
|
||||||
|
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
||||||
|
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
|
||||||
|
data = tensorflow_datasets.load('glue/mrpc')
|
||||||
|
|
||||||
|
# Prepare dataset for GLUE as a tf.data.Dataset instance
|
||||||
|
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
|
||||||
|
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
|
||||||
|
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
|
||||||
|
valid_dataset = valid_dataset.batch(64)
|
||||||
|
|
||||||
|
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
|
||||||
|
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
|
||||||
|
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||||
|
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
|
||||||
|
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
|
||||||
|
|
||||||
|
# Train and evaluate using tf.keras.Model.fit()
|
||||||
|
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
|
||||||
|
validation_data=valid_dataset, validation_steps=7)
|
||||||
|
|
||||||
|
# Load the TensorFlow model in PyTorch for inspection
|
||||||
|
model.save_pretrained('./save/')
|
||||||
|
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
|
||||||
|
|
||||||
|
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
|
||||||
|
sentence_0 = "This research was consistent with his findings."
|
||||||
|
sentence_1 = "His findings were compatible with this research."
|
||||||
|
sentence_2 = "His findings were not compatible with this research."
|
||||||
|
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
|
||||||
|
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
|
||||||
|
|
||||||
|
pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
|
||||||
|
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
|
||||||
|
print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
|
||||||
|
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
|
||||||
@@ -24,7 +24,7 @@ import math
|
|||||||
import collections
|
import collections
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||||
|
|
||||||
# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
|
# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
|
||||||
from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
|
from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
|
||||||
|
|||||||
50
hubconf.py
50
hubconf.py
@@ -1,7 +1,7 @@
|
|||||||
from pytorch_transformers import (
|
from transformers import (
|
||||||
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
|
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
|
||||||
)
|
)
|
||||||
from pytorch_transformers.file_utils import add_start_docstrings
|
from transformers.file_utils import add_start_docstrings
|
||||||
|
|
||||||
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
|
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
|
||||||
|
|
||||||
@@ -11,12 +11,12 @@ def config(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.
|
config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.
|
||||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
|
config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json')
|
||||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
|
config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
|
||||||
assert config.output_attention == True
|
assert config.output_attention == True
|
||||||
config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
|
config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
|
||||||
assert config.output_attention == True
|
assert config.output_attention == True
|
||||||
assert unused_kwargs == {'foo': False}
|
assert unused_kwargs == {'foo': False}
|
||||||
|
|
||||||
@@ -31,8 +31,8 @@ def tokenizer(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache.
|
tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache.
|
||||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -45,13 +45,13 @@ def model(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
assert model.config.output_attention == True
|
assert model.config.output_attention == True
|
||||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -63,13 +63,13 @@ def modelWithLMHead(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
assert model.config.output_attention == True
|
assert model.config.output_attention == True
|
||||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
|
return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
|
||||||
@@ -81,13 +81,13 @@ def modelForSequenceClassification(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
assert model.config.output_attention == True
|
assert model.config.output_attention == True
|
||||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -100,13 +100,13 @@ def modelForQuestionAnswering(*args, **kwargs):
|
|||||||
# Using torch.hub !
|
# Using torch.hub !
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
assert model.config.output_attention == True
|
assert model.config.output_attention == True
|
||||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
|
return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
|
||||||
|
|||||||
@@ -1,75 +0,0 @@
|
|||||||
__version__ = "1.2.0"
|
|
||||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
|
||||||
# default Python logging output behavior when present.
|
|
||||||
# see: https://github.com/abseil/abseil-py/issues/99
|
|
||||||
# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
|
|
||||||
try:
|
|
||||||
import absl.logging
|
|
||||||
absl.logging.set_verbosity('info')
|
|
||||||
absl.logging.set_stderrthreshold('info')
|
|
||||||
absl.logging._warn_preinit_stderr = False
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Tokenizer
|
|
||||||
from .tokenization_utils import (PreTrainedTokenizer)
|
|
||||||
from .tokenization_auto import AutoTokenizer
|
|
||||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
|
||||||
from .tokenization_openai import OpenAIGPTTokenizer
|
|
||||||
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
|
||||||
from .tokenization_gpt2 import GPT2Tokenizer
|
|
||||||
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
|
||||||
from .tokenization_xlm import XLMTokenizer
|
|
||||||
from .tokenization_roberta import RobertaTokenizer
|
|
||||||
from .tokenization_distilbert import DistilBertTokenizer
|
|
||||||
|
|
||||||
# Configurations
|
|
||||||
from .configuration_utils import PretrainedConfig
|
|
||||||
from .configuration_auto import AutoConfig
|
|
||||||
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
|
|
||||||
# Modeling
|
|
||||||
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
|
|
||||||
from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
|
|
||||||
AutoModelWithLMHead)
|
|
||||||
|
|
||||||
from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
|
|
||||||
BertForMaskedLM, BertForNextSentencePrediction,
|
|
||||||
BertForSequenceClassification, BertForMultipleChoice,
|
|
||||||
BertForTokenClassification, BertForQuestionAnswering,
|
|
||||||
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
|
|
||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
|
||||||
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
|
||||||
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
|
|
||||||
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
|
||||||
load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
|
||||||
XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNetForMultipleChoice,
|
|
||||||
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
|
||||||
XLMWithLMHeadModel, XLMForSequenceClassification,
|
|
||||||
XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
|
|
||||||
RobertaForMultipleChoice, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
|
||||||
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
|
||||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
|
|
||||||
# Optimization
|
|
||||||
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
|
||||||
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
|
||||||
|
|
||||||
# Files and general utilities
|
|
||||||
from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
|
|
||||||
cached_path, add_start_docstrings, add_end_docstrings,
|
|
||||||
WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME)
|
|
||||||
@@ -1,5 +1,3 @@
|
|||||||
# PyTorch
|
|
||||||
torch>=1.0.0
|
|
||||||
# progress bars in model download and training scripts
|
# progress bars in model download and training scripts
|
||||||
tqdm
|
tqdm
|
||||||
# Accessing files from S3 directly.
|
# Accessing files from S3 directly.
|
||||||
|
|||||||
23
setup.py
23
setup.py
@@ -13,11 +13,11 @@ To create the package for pypi.
|
|||||||
4. Build both the sources and the wheel. Do not change anything in setup.py between
|
4. Build both the sources and the wheel. Do not change anything in setup.py between
|
||||||
creating the wheel and the source distribution (obviously).
|
creating the wheel and the source distribution (obviously).
|
||||||
|
|
||||||
For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory.
|
For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
|
||||||
(this will build a wheel for the python version you use to build it - make sure you use python 3.x).
|
(this will build a wheel for the python version you use to build it - make sure you use python 3.x).
|
||||||
|
|
||||||
For the sources, run: "python setup.py sdist"
|
For the sources, run: "python setup.py sdist"
|
||||||
You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp.
|
You should now have a /dist directory with both .whl and .tar.gz source versions.
|
||||||
|
|
||||||
5. Check that everything looks correct by uploading the package to the pypi test server:
|
5. Check that everything looks correct by uploading the package to the pypi test server:
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ To create the package for pypi.
|
|||||||
(pypi suggest using twine as other methods upload files via plaintext.)
|
(pypi suggest using twine as other methods upload files via plaintext.)
|
||||||
|
|
||||||
Check that you can install it in a virtualenv by running:
|
Check that you can install it in a virtualenv by running:
|
||||||
pip install -i https://testpypi.python.org/pypi pytorch-transformers
|
pip install -i https://testpypi.python.org/pypi transformers
|
||||||
|
|
||||||
6. Upload the final version to actual pypi:
|
6. Upload the final version to actual pypi:
|
||||||
twine upload dist/* -r pypi
|
twine upload dist/* -r pypi
|
||||||
@@ -37,20 +37,19 @@ from io import open
|
|||||||
from setuptools import find_packages, setup
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="pytorch_transformers",
|
name="transformers",
|
||||||
version="1.2.0",
|
version="2.0.0",
|
||||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
|
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
||||||
author_email="thomas@huggingface.co",
|
author_email="thomas@huggingface.co",
|
||||||
description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
|
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
||||||
long_description=open("README.md", "r", encoding='utf-8').read(),
|
long_description=open("README.md", "r", encoding='utf-8').read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
|
keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
|
||||||
license='Apache',
|
license='Apache',
|
||||||
url="https://github.com/huggingface/pytorch-transformers",
|
url="https://github.com/huggingface/transformers",
|
||||||
packages=find_packages(exclude=["*.tests", "*.tests.*",
|
packages=find_packages(exclude=["*.tests", "*.tests.*",
|
||||||
"tests.*", "tests"]),
|
"tests.*", "tests"]),
|
||||||
install_requires=['torch>=1.0.0',
|
install_requires=['numpy',
|
||||||
'numpy',
|
|
||||||
'boto3',
|
'boto3',
|
||||||
'requests',
|
'requests',
|
||||||
'tqdm',
|
'tqdm',
|
||||||
@@ -59,7 +58,7 @@ setup(
|
|||||||
'sacremoses'],
|
'sacremoses'],
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
"pytorch_transformers=pytorch_transformers.__main__:main",
|
"transformers=transformers.__main__:main",
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
# python_requires='>=3.5.0',
|
# python_requires='>=3.5.0',
|
||||||
|
|||||||
165
transformers/__init__.py
Normal file
165
transformers/__init__.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
__version__ = "2.0.0"
|
||||||
|
|
||||||
|
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||||
|
# default Python logging output behavior when present.
|
||||||
|
# see: https://github.com/abseil/abseil-py/issues/99
|
||||||
|
# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
|
||||||
|
try:
|
||||||
|
import absl.logging
|
||||||
|
absl.logging.set_verbosity('info')
|
||||||
|
absl.logging.set_stderrthreshold('info')
|
||||||
|
absl.logging._warn_preinit_stderr = False
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
# Files and general utilities
|
||||||
|
from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
|
||||||
|
cached_path, add_start_docstrings, add_end_docstrings,
|
||||||
|
WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
|
||||||
|
is_tf_available, is_torch_available)
|
||||||
|
|
||||||
|
from .data import (is_sklearn_available,
|
||||||
|
InputExample, InputFeatures, DataProcessor,
|
||||||
|
glue_output_modes, glue_convert_examples_to_features,
|
||||||
|
glue_processors, glue_tasks_num_labels)
|
||||||
|
|
||||||
|
if is_sklearn_available():
|
||||||
|
from .data import glue_compute_metrics
|
||||||
|
|
||||||
|
# Tokenizers
|
||||||
|
from .tokenization_utils import (PreTrainedTokenizer)
|
||||||
|
from .tokenization_auto import AutoTokenizer
|
||||||
|
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||||
|
from .tokenization_openai import OpenAIGPTTokenizer
|
||||||
|
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
||||||
|
from .tokenization_gpt2 import GPT2Tokenizer
|
||||||
|
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
||||||
|
from .tokenization_xlm import XLMTokenizer
|
||||||
|
from .tokenization_roberta import RobertaTokenizer
|
||||||
|
from .tokenization_distilbert import DistilBertTokenizer
|
||||||
|
|
||||||
|
# Configurations
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
from .configuration_auto import AutoConfig
|
||||||
|
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
|
# Modeling
|
||||||
|
if is_torch_available():
|
||||||
|
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
|
||||||
|
from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
|
||||||
|
AutoModelWithLMHead)
|
||||||
|
|
||||||
|
from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
|
||||||
|
BertForMaskedLM, BertForNextSentencePrediction,
|
||||||
|
BertForSequenceClassification, BertForMultipleChoice,
|
||||||
|
BertForTokenClassification, BertForQuestionAnswering,
|
||||||
|
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
|
||||||
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
||||||
|
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
||||||
|
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
|
||||||
|
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
||||||
|
load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
||||||
|
XLNetForSequenceClassification, XLNetForMultipleChoice,
|
||||||
|
XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
|
||||||
|
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
||||||
|
XLMWithLMHeadModel, XLMForSequenceClassification,
|
||||||
|
XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
|
||||||
|
XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
|
||||||
|
RobertaForSequenceClassification, RobertaForMultipleChoice,
|
||||||
|
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
||||||
|
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
||||||
|
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
# Optimization
|
||||||
|
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
||||||
|
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
||||||
|
|
||||||
|
|
||||||
|
# TensorFlow
|
||||||
|
if is_tf_available():
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
|
||||||
|
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
||||||
|
TFAutoModelWithLMHead)
|
||||||
|
|
||||||
|
from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
|
||||||
|
TFBertModel, TFBertForPreTraining,
|
||||||
|
TFBertForMaskedLM, TFBertForNextSentencePrediction,
|
||||||
|
TFBertForSequenceClassification, TFBertForMultipleChoice,
|
||||||
|
TFBertForTokenClassification, TFBertForQuestionAnswering,
|
||||||
|
load_bert_pt_weights_in_tf2,
|
||||||
|
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer,
|
||||||
|
TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel,
|
||||||
|
load_gpt2_pt_weights_in_tf2,
|
||||||
|
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer,
|
||||||
|
TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel,
|
||||||
|
load_openai_gpt_pt_weights_in_tf2,
|
||||||
|
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer,
|
||||||
|
TFTransfoXLModel, TFTransfoXLLMHeadModel,
|
||||||
|
load_transfo_xl_pt_weights_in_tf2,
|
||||||
|
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
||||||
|
TFXLNetModel, TFXLNetLMHeadModel,
|
||||||
|
TFXLNetForSequenceClassification,
|
||||||
|
TFXLNetForQuestionAnsweringSimple,
|
||||||
|
load_xlnet_pt_weights_in_tf2,
|
||||||
|
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer,
|
||||||
|
TFXLMModel, TFXLMWithLMHeadModel,
|
||||||
|
TFXLMForSequenceClassification,
|
||||||
|
TFXLMForQuestionAnsweringSimple,
|
||||||
|
load_xlm_pt_weights_in_tf2,
|
||||||
|
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer,
|
||||||
|
TFRobertaModel, TFRobertaForMaskedLM,
|
||||||
|
TFRobertaForSequenceClassification,
|
||||||
|
load_roberta_pt_weights_in_tf2,
|
||||||
|
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
|
||||||
|
TFDistilBertModel, TFDistilBertForMaskedLM,
|
||||||
|
TFDistilBertForSequenceClassification,
|
||||||
|
TFDistilBertForQuestionAnswering,
|
||||||
|
load_distilbert_pt_weights_in_tf2,
|
||||||
|
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
# TF 2.0 <=> PyTorch conversion utilities
|
||||||
|
if is_tf_available() and is_torch_available():
|
||||||
|
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
|
||||||
|
load_pytorch_checkpoint_in_tf2_model,
|
||||||
|
load_pytorch_weights_in_tf2_model,
|
||||||
|
load_pytorch_model_in_tf2_model,
|
||||||
|
load_tf2_checkpoint_in_pytorch_model,
|
||||||
|
load_tf2_weights_in_pytorch_model,
|
||||||
|
load_tf2_model_in_pytorch_model)
|
||||||
|
|
||||||
|
if not is_tf_available() and not is_torch_available():
|
||||||
|
logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found. "
|
||||||
|
"Models won't be available and only tokenizers, configuration "
|
||||||
|
"and file/data utilities can be used.")
|
||||||
@@ -3,36 +3,37 @@ def main():
|
|||||||
import sys
|
import sys
|
||||||
if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
|
if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
|
||||||
print(
|
print(
|
||||||
"Should be used as one of: \n"
|
"This command line utility lets you convert original (author released) model checkpoints to pytorch.\n"
|
||||||
">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
|
"It should be used as one of: \n"
|
||||||
">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
|
">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
|
||||||
">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
|
">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
|
||||||
">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
|
">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
|
||||||
">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
|
">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
|
||||||
">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
|
">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
|
||||||
|
">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
|
||||||
else:
|
else:
|
||||||
if sys.argv[1] == "bert":
|
if sys.argv[1] == "bert":
|
||||||
try:
|
try:
|
||||||
from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
|
from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||||
"In that case, it requires TensorFlow to be installed. Please see "
|
"In that case, it requires TensorFlow to be installed. Please see "
|
||||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if len(sys.argv) != 5:
|
if len(sys.argv) != 5:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
|
print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
|
||||||
else:
|
else:
|
||||||
PYTORCH_DUMP_OUTPUT = sys.argv.pop()
|
PYTORCH_DUMP_OUTPUT = sys.argv.pop()
|
||||||
TF_CONFIG = sys.argv.pop()
|
TF_CONFIG = sys.argv.pop()
|
||||||
TF_CHECKPOINT = sys.argv.pop()
|
TF_CHECKPOINT = sys.argv.pop()
|
||||||
convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
||||||
elif sys.argv[1] == "gpt":
|
elif sys.argv[1] == "gpt":
|
||||||
from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
|
from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
|
||||||
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
|
print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
|
||||||
else:
|
else:
|
||||||
OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
|
OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
|
||||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||||
@@ -45,15 +46,15 @@ def main():
|
|||||||
PYTORCH_DUMP_OUTPUT)
|
PYTORCH_DUMP_OUTPUT)
|
||||||
elif sys.argv[1] == "transfo_xl":
|
elif sys.argv[1] == "transfo_xl":
|
||||||
try:
|
try:
|
||||||
from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
|
from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||||
"In that case, it requires TensorFlow to be installed. Please see "
|
"In that case, it requires TensorFlow to be installed. Please see "
|
||||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||||
raise
|
raise
|
||||||
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
|
print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
|
||||||
else:
|
else:
|
||||||
if 'ckpt' in sys.argv[2].lower():
|
if 'ckpt' in sys.argv[2].lower():
|
||||||
TF_CHECKPOINT = sys.argv[2]
|
TF_CHECKPOINT = sys.argv[2]
|
||||||
@@ -69,16 +70,16 @@ def main():
|
|||||||
convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
|
convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
|
||||||
elif sys.argv[1] == "gpt2":
|
elif sys.argv[1] == "gpt2":
|
||||||
try:
|
try:
|
||||||
from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
|
from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||||
"In that case, it requires TensorFlow to be installed. Please see "
|
"In that case, it requires TensorFlow to be installed. Please see "
|
||||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
if len(sys.argv) < 4 or len(sys.argv) > 5:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
|
print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
|
||||||
else:
|
else:
|
||||||
TF_CHECKPOINT = sys.argv[2]
|
TF_CHECKPOINT = sys.argv[2]
|
||||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||||
@@ -89,16 +90,16 @@ def main():
|
|||||||
convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
||||||
elif sys.argv[1] == "xlnet":
|
elif sys.argv[1] == "xlnet":
|
||||||
try:
|
try:
|
||||||
from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
|
from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||||
"In that case, it requires TensorFlow to be installed. Please see "
|
"In that case, it requires TensorFlow to be installed. Please see "
|
||||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if len(sys.argv) < 5 or len(sys.argv) > 6:
|
if len(sys.argv) < 5 or len(sys.argv) > 6:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
|
print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
|
||||||
else:
|
else:
|
||||||
TF_CHECKPOINT = sys.argv[2]
|
TF_CHECKPOINT = sys.argv[2]
|
||||||
TF_CONFIG = sys.argv[3]
|
TF_CONFIG = sys.argv[3]
|
||||||
@@ -113,11 +114,11 @@ def main():
|
|||||||
PYTORCH_DUMP_OUTPUT,
|
PYTORCH_DUMP_OUTPUT,
|
||||||
FINETUNING_TASK)
|
FINETUNING_TASK)
|
||||||
elif sys.argv[1] == "xlm":
|
elif sys.argv[1] == "xlm":
|
||||||
from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
|
from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
|
||||||
|
|
||||||
if len(sys.argv) != 4:
|
if len(sys.argv) != 4:
|
||||||
# pylint: disable=line-too-long
|
# pylint: disable=line-too-long
|
||||||
print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
|
print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
|
||||||
else:
|
else:
|
||||||
XLM_CHECKPOINT_PATH = sys.argv[2]
|
XLM_CHECKPOINT_PATH = sys.argv[2]
|
||||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||||
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class AutoConfig(object):
|
class AutoConfig(object):
|
||||||
r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
|
r""":class:`~transformers.AutoConfig` is a generic configuration class
|
||||||
that will be instantiated as one of the configuration classes of the library
|
that will be instantiated as one of the configuration classes of the library
|
||||||
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -76,7 +76,7 @@ class AutoConfig(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
@@ -45,7 +45,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
|
|
||||||
class BertConfig(PretrainedConfig):
|
class BertConfig(PretrainedConfig):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
|
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
|
||||||
`BertModel`.
|
`BertModel`.
|
||||||
|
|
||||||
|
|
||||||
@@ -58,7 +58,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
||||||
layer in the Transformer encoder.
|
layer in the Transformer encoder.
|
||||||
hidden_act: The non-linear activation function (function or string) in the
|
hidden_act: The non-linear activation function (function or string) in the
|
||||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||||
hidden_dropout_prob: The dropout probability for all fully connected
|
hidden_dropout_prob: The dropout probability for all fully connected
|
||||||
layers in the embeddings, encoder, and pooler.
|
layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob: The dropout ratio for the attention
|
attention_probs_dropout_prob: The dropout ratio for the attention
|
||||||
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30522,
|
vocab_size_or_config_json_file=30522,
|
||||||
max_position_embeddings=512,
|
max_position_embeddings=512,
|
||||||
sinusoidal_pos_embds=True,
|
sinusoidal_pos_embds=False,
|
||||||
n_layers=6,
|
n_layers=6,
|
||||||
n_heads=12,
|
n_heads=12,
|
||||||
dim=768,
|
dim=768,
|
||||||
@@ -36,7 +36,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
||||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
|
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -95,10 +95,43 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
init_range=0.01,
|
init_range=0.01,
|
||||||
proj_init_std=0.01,
|
proj_init_std=0.01,
|
||||||
init_std=0.02,
|
init_std=0.02,
|
||||||
|
layer_norm_epsilon=1e-5,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""Constructs TransfoXLConfig.
|
"""Constructs TransfoXLConfig.
|
||||||
"""
|
"""
|
||||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
super(TransfoXLConfig, self).__init__(**kwargs)
|
||||||
|
self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
|
||||||
|
self.cutoffs = []
|
||||||
|
self.cutoffs.extend(cutoffs)
|
||||||
|
self.tie_weight = tie_weight
|
||||||
|
if proj_share_all_but_first:
|
||||||
|
self.tie_projs = [False] + [True] * len(self.cutoffs)
|
||||||
|
else:
|
||||||
|
self.tie_projs = [False] + [False] * len(self.cutoffs)
|
||||||
|
self.d_model = d_model
|
||||||
|
self.d_embed = d_embed
|
||||||
|
self.d_head = d_head
|
||||||
|
self.d_inner = d_inner
|
||||||
|
self.div_val = div_val
|
||||||
|
self.pre_lnorm = pre_lnorm
|
||||||
|
self.n_layer = n_layer
|
||||||
|
self.n_head = n_head
|
||||||
|
self.tgt_len = tgt_len
|
||||||
|
self.ext_len = ext_len
|
||||||
|
self.mem_len = mem_len
|
||||||
|
self.same_length = same_length
|
||||||
|
self.attn_type = attn_type
|
||||||
|
self.clamp_len = clamp_len
|
||||||
|
self.sample_softmax = sample_softmax
|
||||||
|
self.adaptive = adaptive
|
||||||
|
self.dropout = dropout
|
||||||
|
self.dropatt = dropatt
|
||||||
|
self.untie_r = untie_r
|
||||||
|
self.init = init
|
||||||
|
self.init_range = init_range
|
||||||
|
self.proj_init_std = proj_init_std
|
||||||
|
self.init_std = init_std
|
||||||
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||||
@@ -106,39 +139,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
json_config = json.loads(reader.read())
|
json_config = json.loads(reader.read())
|
||||||
for key, value in json_config.items():
|
for key, value in json_config.items():
|
||||||
self.__dict__[key] = value
|
self.__dict__[key] = value
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
elif not isinstance(vocab_size_or_config_json_file, int):
|
||||||
self.n_token = vocab_size_or_config_json_file
|
|
||||||
self.cutoffs = []
|
|
||||||
self.cutoffs.extend(cutoffs)
|
|
||||||
self.tie_weight = tie_weight
|
|
||||||
if proj_share_all_but_first:
|
|
||||||
self.tie_projs = [False] + [True] * len(self.cutoffs)
|
|
||||||
else:
|
|
||||||
self.tie_projs = [False] + [False] * len(self.cutoffs)
|
|
||||||
self.d_model = d_model
|
|
||||||
self.d_embed = d_embed
|
|
||||||
self.d_head = d_head
|
|
||||||
self.d_inner = d_inner
|
|
||||||
self.div_val = div_val
|
|
||||||
self.pre_lnorm = pre_lnorm
|
|
||||||
self.n_layer = n_layer
|
|
||||||
self.n_head = n_head
|
|
||||||
self.tgt_len = tgt_len
|
|
||||||
self.ext_len = ext_len
|
|
||||||
self.mem_len = mem_len
|
|
||||||
self.same_length = same_length
|
|
||||||
self.attn_type = attn_type
|
|
||||||
self.clamp_len = clamp_len
|
|
||||||
self.sample_softmax = sample_softmax
|
|
||||||
self.adaptive = adaptive
|
|
||||||
self.dropout = dropout
|
|
||||||
self.dropatt = dropatt
|
|
||||||
self.untie_r = untie_r
|
|
||||||
self.init = init
|
|
||||||
self.init_range = init_range
|
|
||||||
self.proj_init_std = proj_init_std
|
|
||||||
self.init_std = init_std
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||||
" or the path to a pretrained model config file (str)")
|
" or the path to a pretrained model config file (str)")
|
||||||
|
|
||||||
@@ -54,11 +54,12 @@ class PretrainedConfig(object):
|
|||||||
self.output_attentions = kwargs.pop('output_attentions', False)
|
self.output_attentions = kwargs.pop('output_attentions', False)
|
||||||
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
||||||
self.torchscript = kwargs.pop('torchscript', False)
|
self.torchscript = kwargs.pop('torchscript', False)
|
||||||
|
self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
|
||||||
self.pruned_heads = kwargs.pop('pruned_heads', {})
|
self.pruned_heads = kwargs.pop('pruned_heads', {})
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
def save_pretrained(self, save_directory):
|
||||||
""" Save a configuration object to the directory `save_directory`, so that it
|
""" Save a configuration object to the directory `save_directory`, so that it
|
||||||
can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
|
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
|
||||||
"""
|
"""
|
||||||
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
|
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
|
||||||
|
|
||||||
@@ -66,16 +67,17 @@ class PretrainedConfig(object):
|
|||||||
output_config_file = os.path.join(save_directory, CONFIG_NAME)
|
output_config_file = os.path.join(save_directory, CONFIG_NAME)
|
||||||
|
|
||||||
self.to_json_file(output_config_file)
|
self.to_json_file(output_config_file)
|
||||||
|
logger.info("Configuration saved in {}".format(output_config_file))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||||
r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
|
r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
@@ -174,7 +176,7 @@ class PretrainedConfig(object):
|
|||||||
"""Constructs a `Config` from a Python dictionary of parameters."""
|
"""Constructs a `Config` from a Python dictionary of parameters."""
|
||||||
config = cls(vocab_size_or_config_json_file=-1)
|
config = cls(vocab_size_or_config_json_file=-1)
|
||||||
for key, value in json_object.items():
|
for key, value in json_object.items():
|
||||||
config.__dict__[key] = value
|
setattr(config, key, value)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -56,8 +56,6 @@ class XLMConfig(PretrainedConfig):
|
|||||||
|
|
||||||
dropout: The dropout probability for all fully connected
|
dropout: The dropout probability for all fully connected
|
||||||
layers in the embeddings, encoder, and pooler.
|
layers in the embeddings, encoder, and pooler.
|
||||||
dropatt: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
max_position_embeddings: The maximum sequence length that this model might
|
max_position_embeddings: The maximum sequence length that this model might
|
||||||
ever be used with. Typically set this to something large just in case
|
ever be used with. Typically set this to something large just in case
|
||||||
(e.g., 512 or 1024 or 2048).
|
(e.g., 512 or 1024 or 2048).
|
||||||
@@ -66,7 +64,6 @@ class XLMConfig(PretrainedConfig):
|
|||||||
layer_norm_eps: The epsilon used by LayerNorm.
|
layer_norm_eps: The epsilon used by LayerNorm.
|
||||||
|
|
||||||
dropout: float, dropout rate.
|
dropout: float, dropout rate.
|
||||||
dropatt: float, dropout rate on attention probabilities.
|
|
||||||
init: str, the initialization scheme, either "normal" or "uniform".
|
init: str, the initialization scheme, either "normal" or "uniform".
|
||||||
init_range: float, initialize the parameters with a uniform distribution
|
init_range: float, initialize the parameters with a uniform distribution
|
||||||
in [-init_range, init_range]. Only effective when init="uniform".
|
in [-init_range, init_range]. Only effective when init="uniform".
|
||||||
@@ -49,14 +49,11 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
|
|
||||||
dropout: The dropout probability for all fully connected
|
dropout: The dropout probability for all fully connected
|
||||||
layers in the embeddings, encoder, and pooler.
|
layers in the embeddings, encoder, and pooler.
|
||||||
dropatt: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
initializer_range: The stddev of the truncated_normal_initializer for
|
initializer_range: The stddev of the truncated_normal_initializer for
|
||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
layer_norm_eps: The epsilon used by LayerNorm.
|
layer_norm_eps: The epsilon used by LayerNorm.
|
||||||
|
|
||||||
dropout: float, dropout rate.
|
dropout: float, dropout rate.
|
||||||
dropatt: float, dropout rate on attention probabilities.
|
|
||||||
init: str, the initialization scheme, either "normal" or "uniform".
|
init: str, the initialization scheme, either "normal" or "uniform".
|
||||||
init_range: float, initialize the parameters with a uniform distribution
|
init_range: float, initialize the parameters with a uniform distribution
|
||||||
in [-init_range, init_range]. Only effective when init="uniform".
|
in [-init_range, init_range]. Only effective when init="uniform".
|
||||||
@@ -80,6 +77,7 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
n_layer=24,
|
n_layer=24,
|
||||||
n_head=16,
|
n_head=16,
|
||||||
d_inner=4096,
|
d_inner=4096,
|
||||||
|
max_position_embeddings=512,
|
||||||
ff_activation="gelu",
|
ff_activation="gelu",
|
||||||
untie_r=True,
|
untie_r=True,
|
||||||
attn_type="bi",
|
attn_type="bi",
|
||||||
@@ -112,7 +110,7 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||||
json_config = json.loads(reader.read())
|
json_config = json.loads(reader.read())
|
||||||
for key, value in json_config.items():
|
for key, value in json_config.items():
|
||||||
self.__dict__[key] = value
|
# Store each entry loaded from the JSON config on the instance being built.
setattr(self, key, value)
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
elif isinstance(vocab_size_or_config_json_file, int):
|
||||||
self.n_token = vocab_size_or_config_json_file
|
self.n_token = vocab_size_or_config_json_file
|
||||||
self.d_model = d_model
|
self.d_model = d_model
|
||||||
@@ -21,7 +21,7 @@ from __future__ import print_function
|
|||||||
import argparse
|
import argparse
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
|
from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
@@ -20,7 +20,7 @@ import argparse
|
|||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from pytorch_transformers import BertModel
|
from transformers import BertModel
|
||||||
|
|
||||||
|
|
||||||
def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
|
def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
|
||||||
@@ -21,7 +21,7 @@ from io import open
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
from transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
GPT2Config,
|
GPT2Config,
|
||||||
GPT2Model,
|
GPT2Model,
|
||||||
load_tf_weights_in_gpt2)
|
load_tf_weights_in_gpt2)
|
||||||
@@ -21,7 +21,7 @@ from io import open
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
from transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
OpenAIGPTConfig,
|
OpenAIGPTConfig,
|
||||||
OpenAIGPTModel,
|
OpenAIGPTModel,
|
||||||
load_tf_weights_in_openai_gpt)
|
load_tf_weights_in_openai_gpt)
|
||||||
233
transformers/convert_pytorch_checkpoint_to_tf2.py
Normal file
233
transformers/convert_pytorch_checkpoint_to_tf2.py
Normal file
@@ -0,0 +1,233 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Convert pytorch checkpoints to TensorFlow """
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from transformers import is_torch_available, cached_path
|
||||||
|
|
||||||
|
from transformers import (BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
else:
|
||||||
|
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,) = (
|
||||||
|
None, None, None, None,
|
||||||
|
None, None,
|
||||||
|
None, None,
|
||||||
|
None, None,
|
||||||
|
None, None,
|
||||||
|
None, None,
|
||||||
|
None, None, None,
|
||||||
|
None, None, None,)
|
||||||
|
|
||||||
|
|
||||||
|
import logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
MODEL_CLASSES = {
|
||||||
|
'bert': (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'xlm': (XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'roberta': (RobertaConfig, TFRobertaForMaskedLM, load_roberta_pt_weights_in_tf2, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, load_distilbert_pt_weights_in_tf2, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
}
|
||||||
|
|
||||||
|
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
    """Convert one PyTorch checkpoint into a TensorFlow 2.0 ``h5`` weights file.

    Args:
        model_type: Key of ``MODEL_CLASSES`` selecting the config class, the TF
            and PyTorch model classes, the weight-loading function and the
            shortcut-name maps.
        pytorch_checkpoint_path: Path to the PyTorch checkpoint, or a shortcut
            name present in the model map of ``model_type``.
        config_file: Path to the JSON configuration, or a shortcut name present
            in the config map of ``model_type``.
        tf_dump_path: Destination file for the TensorFlow weights.
        compare_with_pt_model: If ``True``, run both models on a small fixed
            batch and check that the first outputs agree within ``2e-2``.
        use_cached_models: If ``False``, shortcut names are re-downloaded
            (``force_download=True``) instead of reusing cached files.

    Raises:
        ValueError: If ``model_type`` is not a key of ``MODEL_CLASSES``.
        AssertionError: If ``compare_with_pt_model`` is set and the maximum
            absolute difference between the outputs exceeds ``2e-2``.
    """
    if model_type not in MODEL_CLASSES:
        raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))

    config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]

    # Initialise TF model
    if config_file in aws_config_map:
        config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models)
    config = config_class.from_json_file(config_file)
    config.output_hidden_states = True
    config.output_attentions = True
    print("Building TensorFlow model from configuration: {}".format(str(config)))
    tf_model = model_class(config)

    # Load weights from the PyTorch checkpoint into the TF model
    if pytorch_checkpoint_path in aws_model_maps:
        pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models)
    tf_model = loading_fct(tf_model, pytorch_checkpoint_path)

    if compare_with_pt_model:
        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
        tf_inputs = tf.constant(inputs_list)
        tfo = tf_model(tf_inputs, training=False)  # build the network

        # NOTE(security): torch.load unpickles the file; only convert
        # checkpoints that come from a trusted source.
        state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
        pt_model = pt_model_class.from_pretrained(None,
                                                  config=config,
                                                  state_dict=state_dict)
        pt_inputs = torch.tensor(inputs_list)
        with torch.no_grad():
            pto = pt_model(pt_inputs)

        # Compare only the first output of each model
        np_pt = pto[0].detach().numpy()
        np_tf = tfo[0].numpy()
        diff = np.amax(np.abs(np_pt - np_tf))
        print("Max absolute difference between models outputs {}".format(diff))
        assert diff <= 2e-2, "Error, model absolute difference is >2e-2"

    # Save the TensorFlow model weights
    print("Save TensorFlow model to {}".format(tf_dump_path))
    tf_model.save_weights(tf_dump_path, save_format='h5')
|
||||||
|
|
||||||
|
|
||||||
|
def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
    """Convert a batch of PyTorch checkpoints to TensorFlow 2.0 ``h5`` files.

    Args:
        args_model_type: Key of ``MODEL_CLASSES`` to convert, or ``None`` to
            iterate over every key of ``MODEL_CLASSES``.
        tf_dump_path: Existing directory receiving one
            ``<shortcut>-tf_model.h5`` file per converted checkpoint.
        model_shortcut_names_or_path: Checkpoint shortcut names or paths. When
            ``None``, all shortcut names of the current model type are used.
        config_shortcut_names_or_path: Config shortcut names or paths, paired
            with the checkpoints by position. When ``None``, the checkpoint
            names are reused.
        compare_with_pt_model: Forwarded to ``convert_pt_checkpoint_to_tf``.
        use_cached_models: If ``False``, files are re-downloaded
            (``force_download=True``).
        only_convert_finetuned_models: If ``True``, convert only checkpoints
            whose name contains ``-squad``, ``-mrpc`` or ``-mnli``; otherwise
            those checkpoints are skipped.

    Raises:
        AssertionError: If ``tf_dump_path`` is not a directory.
        ValueError: If a model type is not a key of ``MODEL_CLASSES``.
    """
    # Check the argument itself rather than the script-level ``args`` global,
    # so the function also works when imported from another module.
    assert os.path.isdir(tf_dump_path), "--tf_dump_path should be a directory"

    if args_model_type is None:
        model_types = list(MODEL_CLASSES.keys())
    else:
        model_types = [args_model_type]

    finetuned_markers = ('-squad', '-mrpc', '-mnli')

    for j, model_type in enumerate(model_types, start=1):
        print("=" * 100)
        print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type))
        print("=" * 100)
        if model_type not in MODEL_CLASSES:
            raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())))

        config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]

        # Resolve the defaults into locals: writing them back into the
        # arguments would make every later model type reuse the shortcut
        # names of the first one.
        model_names = model_shortcut_names_or_path
        if model_names is None:
            model_names = list(aws_model_maps.keys())
        config_names = config_shortcut_names_or_path
        if config_names is None:
            config_names = model_names

        checkpoints = list(zip(model_names, config_names))
        for i, (model_shortcut_name, config_shortcut_name) in enumerate(checkpoints, start=1):
            print("-" * 100)
            # Finetuned checkpoints are converted with their own entry of
            # MODEL_CLASSES; keep that in a local so the outer loop variable
            # is not clobbered.
            checkpoint_model_type = model_type
            if any(marker in model_shortcut_name for marker in finetuned_markers):
                if not only_convert_finetuned_models:
                    print(" Skipping finetuned checkpoint {}".format(model_shortcut_name))
                    continue
                checkpoint_model_type = model_shortcut_name
            elif only_convert_finetuned_models:
                print(" Skipping not finetuned checkpoint {}".format(model_shortcut_name))
                continue
            print(" Converting checkpoint {}/{}: {} - model_type {}".format(i, len(checkpoints), model_shortcut_name, checkpoint_model_type))
            print("-" * 100)

            if config_shortcut_name in aws_config_map:
                config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models)
            else:
                config_file = cached_path(config_shortcut_name, force_download=not use_cached_models)

            if model_shortcut_name in aws_model_maps:
                model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models)
            else:
                model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)

            convert_pt_checkpoint_to_tf(checkpoint_model_type,
                                        model_file,
                                        config_file,
                                        os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                        compare_with_pt_model=compare_with_pt_model)

            # Free disk space after conversion, but only remove files whose
            # resolved location differs from the name the caller passed
            # (presumably cache copies -- TODO confirm against cached_path),
            # so a user-supplied local file is never deleted. The set also
            # avoids removing the same path twice.
            caller_names = {config_shortcut_name, model_shortcut_name}
            for resolved_path in {config_file, model_file} - caller_names:
                os.remove(resolved_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and convert the requested
    # checkpoint(s) into the --tf_dump_path directory.
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_dump_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output directory for the Tensorflow dump files.")
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        help="Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys())))
    parser.add_argument("--pytorch_checkpoint_path",
                        default=None,
                        type=str,
                        help="Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
                             "If not given, will download and convert all the checkpoints from AWS.")
    parser.add_argument("--config_file",
                        default=None,
                        type=str,
                        help="The config json file corresponding to the pre-trained model. \n"
                             "This specifies the model architecture. If not given and "
                             "--pytorch_checkpoint_path is not given or is a shortcut name "
                             "use the configuration associated to the shortcut name on the AWS")
    parser.add_argument("--compare_with_pt_model",
                        action='store_true',
                        help="Compare Tensorflow and PyTorch model predictions.")
    parser.add_argument("--use_cached_models",
                        action='store_true',
                        help="Use cached models if possible instead of updating to latest checkpoint versions.")
    parser.add_argument("--only_convert_finetuned_models",
                        action='store_true',
                        help="Only convert finetuned models.")
    args = parser.parse_args()

    model_names = [args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None
    # --config_file only makes sense next to an explicit checkpoint: the two
    # lists are paired by position, so it is ignored (as before) otherwise.
    config_names = None
    if model_names is not None and args.config_file is not None:
        config_names = [args.config_file]

    convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None,
                                     args.tf_dump_path,
                                     model_shortcut_names_or_path=model_names,
                                     config_shortcut_names_or_path=config_names,
                                     compare_with_pt_model=args.compare_with_pt_model,
                                     use_cached_models=args.use_cached_models,
                                     only_convert_finetuned_models=args.only_convert_finetuned_models)
|
||||||
@@ -23,12 +23,12 @@ import torch
|
|||||||
|
|
||||||
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
||||||
from fairseq.modules import TransformerSentenceEncoderLayer
|
from fairseq.modules import TransformerSentenceEncoderLayer
|
||||||
from pytorch_transformers import (BertConfig, BertEncoder,
|
from transformers import (BertConfig, BertEncoder,
|
||||||
BertIntermediate, BertLayer,
|
BertIntermediate, BertLayer,
|
||||||
BertModel, BertOutput,
|
BertModel, BertOutput,
|
||||||
BertSelfAttention,
|
BertSelfAttention,
|
||||||
BertSelfOutput)
|
BertSelfOutput)
|
||||||
from pytorch_transformers import (RobertaEmbeddings,
|
from transformers import (RobertaEmbeddings,
|
||||||
RobertaForMaskedLM,
|
RobertaForMaskedLM,
|
||||||
RobertaForSequenceClassification,
|
RobertaForSequenceClassification,
|
||||||
RobertaModel)
|
RobertaModel)
|
||||||
@@ -23,12 +23,12 @@ from io import open
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
import pytorch_transformers.tokenization_transfo_xl as data_utils
|
import transformers.tokenization_transfo_xl as data_utils
|
||||||
|
|
||||||
from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
|
from transformers import CONFIG_NAME, WEIGHTS_NAME
|
||||||
from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
|
from transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
|
||||||
load_tf_weights_in_transfo_xl)
|
load_tf_weights_in_transfo_xl)
|
||||||
from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
|
from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
|
||||||
|
|
||||||
if sys.version_info[0] == 2:
|
if sys.version_info[0] == 2:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
@@ -23,8 +23,8 @@ from io import open
|
|||||||
import torch
|
import torch
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
|
from transformers import CONFIG_NAME, WEIGHTS_NAME
|
||||||
from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
|
from transformers.tokenization_xlm import VOCAB_FILES_NAMES
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
@@ -33,7 +33,15 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
|
|||||||
# Load checkpoint
|
# Load checkpoint
|
||||||
chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
|
chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
|
||||||
|
|
||||||
model = chkpt['model']
|
state_dict = chkpt['model']
|
||||||
|
|
||||||
|
# We have the base model one level deeper than the original XLM repository
|
||||||
|
two_levels_state_dict = {}
|
||||||
|
for k, v in state_dict.items():
|
||||||
|
if 'pred_layer' in k:
|
||||||
|
two_levels_state_dict[k] = v
|
||||||
|
else:
|
||||||
|
two_levels_state_dict['transformer.' + k] = v
|
||||||
|
|
||||||
config = chkpt['params']
|
config = chkpt['params']
|
||||||
config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
|
config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
|
||||||
@@ -47,7 +55,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
|
|||||||
pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file']
|
pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file']
|
||||||
|
|
||||||
print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
|
print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
|
||||||
torch.save(model, pytorch_weights_dump_path)
|
torch.save(two_levels_state_dict, pytorch_weights_dump_path)
|
||||||
|
|
||||||
print("Save configuration file to {}".format(pytorch_config_dump_path))
|
print("Save configuration file to {}".format(pytorch_config_dump_path))
|
||||||
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
|
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
|
||||||
@@ -22,7 +22,7 @@ import os
|
|||||||
import argparse
|
import argparse
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
from transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
XLNetConfig,
|
XLNetConfig,
|
||||||
XLNetLMHeadModel, XLNetForQuestionAnswering,
|
XLNetLMHeadModel, XLNetForQuestionAnswering,
|
||||||
XLNetForSequenceClassification,
|
XLNetForSequenceClassification,
|
||||||
6
transformers/data/__init__.py
Normal file
6
transformers/data/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from .processors import InputExample, InputFeatures, DataProcessor
|
||||||
|
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||||
|
|
||||||
|
from .metrics import is_sklearn_available
|
||||||
|
if is_sklearn_available():
|
||||||
|
from .metrics import glue_compute_metrics
|
||||||
83
transformers/data/metrics/__init__.py
Normal file
83
transformers/data/metrics/__init__.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Probe the optional metric dependencies once at import time; the result is
# exposed through is_sklearn_available().
try:
    from scipy.stats import pearsonr, spearmanr
    from sklearn.metrics import matthews_corrcoef, f1_score
    _has_sklearn = True
except (AttributeError, ImportError):
    # NOTE(review): AttributeError is caught alongside ImportError, presumably
    # to cover broken or partial installs -- confirm before narrowing.
    logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
    _has_sklearn = False


def is_sklearn_available():
    """Return ``True`` if the scipy and scikit-learn imports above succeeded."""
    return _has_sklearn
|
||||||
|
|
||||||
|
if _has_sklearn:

    def simple_accuracy(preds, labels):
        """Return the fraction of positions where ``preds`` equals ``labels``.

        Inputs are converted with ``numpy.asarray`` so plain Python sequences
        are accepted as well as arrays, like the other metrics in this module.
        """
        # numpy is a hard dependency of scipy/scikit-learn, which are known to
        # be importable inside this guarded block.
        import numpy as np
        return (np.asarray(preds) == np.asarray(labels)).mean()


    def acc_and_f1(preds, labels):
        """Return accuracy, F1 and their mean under ``acc``, ``f1``, ``acc_and_f1``."""
        acc = simple_accuracy(preds, labels)
        f1 = f1_score(y_true=labels, y_pred=preds)
        return {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
        }


    def pearson_and_spearman(preds, labels):
        """Return Pearson and Spearman correlations and their mean.

        The result keys are ``pearson``, ``spearmanr`` and ``corr``.
        """
        pearson_corr = pearsonr(preds, labels)[0]
        spearman_corr = spearmanr(preds, labels)[0]
        return {
            "pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2,
        }


    def glue_compute_metrics(task_name, preds, labels):
        """Compute the metric dictionary of a GLUE task.

        Args:
            task_name: One of ``cola``, ``sst-2``, ``mrpc``, ``sts-b``, ``qqp``,
                ``mnli``, ``mnli-mm``, ``qnli``, ``rte``, ``wnli``.
            preds: Predictions, same length as ``labels``.
            labels: Reference labels.

        Returns:
            A dict mapping metric names to values.

        Raises:
            AssertionError: If ``preds`` and ``labels`` differ in length.
            KeyError: If ``task_name`` is not one of the tasks above.
        """
        assert len(preds) == len(labels)
        if task_name == "cola":
            return {"mcc": matthews_corrcoef(labels, preds)}
        if task_name in ("mrpc", "qqp"):
            return acc_and_f1(preds, labels)
        if task_name == "sts-b":
            return pearson_and_spearman(preds, labels)
        if task_name in ("sst-2", "mnli", "mnli-mm", "qnli", "rte", "wnli"):
            return {"acc": simple_accuracy(preds, labels)}
        raise KeyError(task_name)
|
||||||
3
transformers/data/processors/__init__.py
Normal file
3
transformers/data/processors/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from .utils import InputExample, InputFeatures, DataProcessor
|
||||||
|
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||||
|
|
||||||
@@ -13,84 +13,154 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" BERT classification fine-tuning: utilities to work with GLUE tasks """
|
""" GLUE processors and helpers """
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
|
||||||
|
|
||||||
import csv
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
from io import open
|
|
||||||
|
|
||||||
from scipy.stats import pearsonr, spearmanr
|
from .utils import DataProcessor, InputExample, InputFeatures
|
||||||
from sklearn.metrics import matthews_corrcoef, f1_score
|
from ...file_utils import is_tf_available
|
||||||
|
|
||||||
|
if is_tf_available():
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class InputExample(object):
|
def glue_convert_examples_to_features(examples, tokenizer,
                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task. Required when ``examples`` is a ``tf.data.Dataset``, because the
            task processor converts each dataset element into an ``InputExample``.
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    Raises:
        ValueError: If ``examples`` is a ``tf.data.Dataset`` and ``task`` is ``None``, or if
            ``output_mode`` is ``classification`` and no label list is available.
        KeyError: If ``output_mode`` is neither ``classification`` nor ``regression``, or if an
            example label is missing from the label list.
        AssertionError: If an encoded example does not have length ``max_length`` after padding.
    """
    is_tf_dataset = is_tf_available() and isinstance(examples, tf.data.Dataset)

    processor = None
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s", label_list, task)
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s", output_mode, task)

    # Fail early with a clear message instead of an unbound ``processor`` or
    # ``enumerate(None)`` further down.
    if is_tf_dataset and processor is None:
        raise ValueError("`task` must be given when `examples` is a tf.data.Dataset")
    if output_mode == "classification" and label_list is None:
        raise ValueError("`label_list` or `task` must be given for the classification output mode")

    label_map = {label: i for i, label in enumerate(label_list or [])}

    # Attention-mask values for real and padded positions
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d", ex_index)
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            truncate_first_sequence=True  # We're truncating the first sequence in priority
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # Only real tokens are attended to.
        attention_mask = [real_mask_value] * len(input_ids)

        # Pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([pad_mask_value] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([pad_mask_value] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s", " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s", " ".join([str(x) for x in token_type_ids]))
            # %s instead of %d so float regression labels are not truncated
            logger.info("label: %s (id = %s)", example.label, label)

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_dataset:
        def gen():
            for ex in features:
                yield ({'input_ids': ex.input_ids,
                        'attention_mask': ex.attention_mask,
                        'token_type_ids': ex.token_type_ids},
                       ex.label)

        # Regression labels are floats; declaring them as int64 would lose
        # the fractional part.
        label_type = tf.float32 if output_mode == "regression" else tf.int64
        return tf.data.Dataset.from_generator(gen,
                                              ({'input_ids': tf.int32,
                                                'attention_mask': tf.int32,
                                                'token_type_ids': tf.int32},
                                               label_type),
                                              ({'input_ids': tf.TensorShape([None]),
                                                'attention_mask': tf.TensorShape([None]),
                                                'token_type_ids': tf.TensorShape([None])},
                                               tf.TensorShape([])))

    return features
|
||||||
|
|
||||||
|
|
||||||
class MrpcProcessor(DataProcessor):
|
class MrpcProcessor(DataProcessor):
|
||||||
"""Processor for the MRPC data set (GLUE version)."""
|
"""Processor for the MRPC data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries of ``tensor_dict``."""
    guid = tensor_dict['idx'].numpy()
    # Text entries are decoded from UTF-8 bytes; the label is stringified.
    first_sentence = tensor_dict['sentence1'].numpy().decode('utf-8')
    second_sentence = tensor_dict['sentence2'].numpy().decode('utf-8')
    label = str(tensor_dict['label'].numpy())
    return InputExample(guid, first_sentence, second_sentence, label)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
|
logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
|
||||||
@@ -124,6 +194,13 @@ class MrpcProcessor(DataProcessor):
|
|||||||
class MnliProcessor(DataProcessor):
|
class MnliProcessor(DataProcessor):
|
||||||
"""Processor for the MultiNLI data set (GLUE version)."""
|
"""Processor for the MultiNLI data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``premise``, ``hypothesis`` and ``label`` entries of ``tensor_dict``."""
    guid = tensor_dict['idx'].numpy()
    # Text entries are decoded from UTF-8 bytes; the label is stringified.
    premise = tensor_dict['premise'].numpy().decode('utf-8')
    hypothesis = tensor_dict['hypothesis'].numpy().decode('utf-8')
    label = str(tensor_dict['label'].numpy())
    return InputExample(guid, premise, hypothesis, label)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -167,6 +244,13 @@ class MnliMismatchedProcessor(MnliProcessor):
|
|||||||
class ColaProcessor(DataProcessor):
|
class ColaProcessor(DataProcessor):
|
||||||
"""Processor for the CoLA data set (GLUE version)."""
|
"""Processor for the CoLA data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build a single-sentence ``InputExample`` from the ``idx``, ``sentence`` and ``label`` entries of ``tensor_dict``."""
    guid = tensor_dict['idx'].numpy()
    # The sentence is decoded from UTF-8 bytes; there is no second text.
    sentence = tensor_dict['sentence'].numpy().decode('utf-8')
    label = str(tensor_dict['label'].numpy())
    return InputExample(guid, sentence, None, label)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -196,6 +280,13 @@ class ColaProcessor(DataProcessor):
|
|||||||
class Sst2Processor(DataProcessor):
|
class Sst2Processor(DataProcessor):
|
||||||
"""Processor for the SST-2 data set (GLUE version)."""
|
"""Processor for the SST-2 data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build a single-sentence ``InputExample`` from the ``idx``, ``sentence`` and ``label`` entries of ``tensor_dict``."""
    guid = tensor_dict['idx'].numpy()
    # The sentence is decoded from UTF-8 bytes; there is no second text.
    sentence = tensor_dict['sentence'].numpy().decode('utf-8')
    label = str(tensor_dict['label'].numpy())
    return InputExample(guid, sentence, None, label)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -227,6 +318,13 @@ class Sst2Processor(DataProcessor):
|
|||||||
class StsbProcessor(DataProcessor):
|
class StsbProcessor(DataProcessor):
|
||||||
"""Processor for the STS-B data set (GLUE version)."""
|
"""Processor for the STS-B data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries of ``tensor_dict``."""
    guid = tensor_dict['idx'].numpy()
    # Text entries are decoded from UTF-8 bytes; the label is stringified.
    first_sentence = tensor_dict['sentence1'].numpy().decode('utf-8')
    second_sentence = tensor_dict['sentence2'].numpy().decode('utf-8')
    label = str(tensor_dict['label'].numpy())
    return InputExample(guid, first_sentence, second_sentence, label)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -259,6 +357,13 @@ class StsbProcessor(DataProcessor):
|
|||||||
class QqpProcessor(DataProcessor):
|
class QqpProcessor(DataProcessor):
|
||||||
"""Processor for the QQP data set (GLUE version)."""
|
"""Processor for the QQP data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``question1``, ``question2`` and ``label`` entries of ``tensor_dict``."""
    guid = tensor_dict['idx'].numpy()
    # Text entries are decoded from UTF-8 bytes; the label is stringified.
    first_question = tensor_dict['question1'].numpy().decode('utf-8')
    second_question = tensor_dict['question2'].numpy().decode('utf-8')
    label = str(tensor_dict['label'].numpy())
    return InputExample(guid, first_question, second_question, label)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -294,6 +399,13 @@ class QqpProcessor(DataProcessor):
|
|||||||
class QnliProcessor(DataProcessor):
|
class QnliProcessor(DataProcessor):
|
||||||
"""Processor for the QNLI data set (GLUE version)."""
|
"""Processor for the QNLI data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an ``InputExample`` from the ``idx``, ``question``, ``sentence`` and ``label`` entries of ``tensor_dict``."""
    guid = tensor_dict['idx'].numpy()
    # Text entries are decoded from UTF-8 bytes; the label is stringified.
    question = tensor_dict['question'].numpy().decode('utf-8')
    sentence = tensor_dict['sentence'].numpy().decode('utf-8')
    label = str(tensor_dict['label'].numpy())
    return InputExample(guid, question, sentence, label)
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -302,7 +414,7 @@ class QnliProcessor(DataProcessor):
|
|||||||
def get_dev_examples(self, data_dir):
|
def get_dev_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
|
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
|
||||||
"dev_matched")
|
"dev_matched")
|
||||||
|
|
||||||
def get_labels(self):
|
def get_labels(self):
|
||||||
@@ -327,6 +439,13 @@ class QnliProcessor(DataProcessor):
|
|||||||
class RteProcessor(DataProcessor):
|
class RteProcessor(DataProcessor):
|
||||||
"""Processor for the RTE data set (GLUE version)."""
|
"""Processor for the RTE data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
|
||||||
|
"""See base class."""
|
||||||
|
return InputExample(tensor_dict['idx'].numpy(),
|
||||||
|
tensor_dict['sentence1'].numpy().decode('utf-8'),
|
||||||
|
tensor_dict['sentence2'].numpy().decode('utf-8'),
|
||||||
|
str(tensor_dict['label'].numpy()))
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -359,6 +478,13 @@ class RteProcessor(DataProcessor):
|
|||||||
class WnliProcessor(DataProcessor):
|
class WnliProcessor(DataProcessor):
|
||||||
"""Processor for the WNLI data set (GLUE version)."""
|
"""Processor for the WNLI data set (GLUE version)."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
|
||||||
|
"""See base class."""
|
||||||
|
return InputExample(tensor_dict['idx'].numpy(),
|
||||||
|
tensor_dict['sentence1'].numpy().decode('utf-8'),
|
||||||
|
tensor_dict['sentence2'].numpy().decode('utf-8'),
|
||||||
|
str(tensor_dict['label'].numpy()))
|
||||||
|
|
||||||
def get_train_examples(self, data_dir):
|
def get_train_examples(self, data_dir):
|
||||||
"""See base class."""
|
"""See base class."""
|
||||||
return self._create_examples(
|
return self._create_examples(
|
||||||
@@ -387,198 +513,19 @@ class WnliProcessor(DataProcessor):
|
|||||||
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
||||||
return examples
|
return examples
|
||||||
|
|
||||||
|
glue_tasks_num_labels = {
|
||||||
|
"cola": 2,
|
||||||
|
"mnli": 3,
|
||||||
|
"mrpc": 2,
|
||||||
|
"sst-2": 2,
|
||||||
|
"sts-b": 1,
|
||||||
|
"qqp": 2,
|
||||||
|
"qnli": 2,
|
||||||
|
"rte": 2,
|
||||||
|
"wnli": 2,
|
||||||
|
}
|
||||||
|
|
||||||
def convert_examples_to_features(examples, label_list, max_seq_length,
|
glue_processors = {
|
||||||
tokenizer, output_mode,
|
|
||||||
cls_token_at_end=False,
|
|
||||||
cls_token='[CLS]',
|
|
||||||
cls_token_segment_id=1,
|
|
||||||
sep_token='[SEP]',
|
|
||||||
sep_token_extra=False,
|
|
||||||
pad_on_left=False,
|
|
||||||
pad_token=0,
|
|
||||||
pad_token_segment_id=0,
|
|
||||||
sequence_a_segment_id=0,
|
|
||||||
sequence_b_segment_id=1,
|
|
||||||
mask_padding_with_zero=True):
|
|
||||||
""" Loads a data file into a list of `InputBatch`s
|
|
||||||
`cls_token_at_end` define the location of the CLS token:
|
|
||||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
|
||||||
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
|
|
||||||
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
|
|
||||||
"""
|
|
||||||
|
|
||||||
label_map = {label : i for i, label in enumerate(label_list)}
|
|
||||||
|
|
||||||
features = []
|
|
||||||
for (ex_index, example) in enumerate(examples):
|
|
||||||
if ex_index % 10000 == 0:
|
|
||||||
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
|
|
||||||
|
|
||||||
tokens_a = tokenizer.tokenize(example.text_a)
|
|
||||||
|
|
||||||
tokens_b = None
|
|
||||||
if example.text_b:
|
|
||||||
tokens_b = tokenizer.tokenize(example.text_b)
|
|
||||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
|
||||||
# length is less than the specified length.
|
|
||||||
# Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
|
|
||||||
special_tokens_count = 4 if sep_token_extra else 3
|
|
||||||
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
|
|
||||||
else:
|
|
||||||
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
|
|
||||||
special_tokens_count = 3 if sep_token_extra else 2
|
|
||||||
if len(tokens_a) > max_seq_length - special_tokens_count:
|
|
||||||
tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
|
|
||||||
|
|
||||||
# The convention in BERT is:
|
|
||||||
# (a) For sequence pairs:
|
|
||||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
|
||||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
|
||||||
# (b) For single sequences:
|
|
||||||
# tokens: [CLS] the dog is hairy . [SEP]
|
|
||||||
# type_ids: 0 0 0 0 0 0 0
|
|
||||||
#
|
|
||||||
# Where "type_ids" are used to indicate whether this is the first
|
|
||||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
|
||||||
# `type=1` were learned during pre-training and are added to the wordpiece
|
|
||||||
# embedding vector (and position vector). This is not *strictly* necessary
|
|
||||||
# since the [SEP] token unambiguously separates the sequences, but it makes
|
|
||||||
# it easier for the model to learn the concept of sequences.
|
|
||||||
#
|
|
||||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
|
||||||
# used as the "sentence vector". Note that this only makes sense because
|
|
||||||
# the entire model is fine-tuned.
|
|
||||||
tokens = tokens_a + [sep_token]
|
|
||||||
if sep_token_extra:
|
|
||||||
# roberta uses an extra separator b/w pairs of sentences
|
|
||||||
tokens += [sep_token]
|
|
||||||
segment_ids = [sequence_a_segment_id] * len(tokens)
|
|
||||||
|
|
||||||
if tokens_b:
|
|
||||||
tokens += tokens_b + [sep_token]
|
|
||||||
segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
|
|
||||||
|
|
||||||
if cls_token_at_end:
|
|
||||||
tokens = tokens + [cls_token]
|
|
||||||
segment_ids = segment_ids + [cls_token_segment_id]
|
|
||||||
else:
|
|
||||||
tokens = [cls_token] + tokens
|
|
||||||
segment_ids = [cls_token_segment_id] + segment_ids
|
|
||||||
|
|
||||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
|
||||||
|
|
||||||
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
|
||||||
# tokens are attended to.
|
|
||||||
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
|
|
||||||
|
|
||||||
# Zero-pad up to the sequence length.
|
|
||||||
padding_length = max_seq_length - len(input_ids)
|
|
||||||
if pad_on_left:
|
|
||||||
input_ids = ([pad_token] * padding_length) + input_ids
|
|
||||||
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
|
|
||||||
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
|
|
||||||
else:
|
|
||||||
input_ids = input_ids + ([pad_token] * padding_length)
|
|
||||||
input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
|
|
||||||
segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
|
|
||||||
|
|
||||||
assert len(input_ids) == max_seq_length
|
|
||||||
assert len(input_mask) == max_seq_length
|
|
||||||
assert len(segment_ids) == max_seq_length
|
|
||||||
|
|
||||||
if output_mode == "classification":
|
|
||||||
label_id = label_map[example.label]
|
|
||||||
elif output_mode == "regression":
|
|
||||||
label_id = float(example.label)
|
|
||||||
else:
|
|
||||||
raise KeyError(output_mode)
|
|
||||||
|
|
||||||
if ex_index < 5:
|
|
||||||
logger.info("*** Example ***")
|
|
||||||
logger.info("guid: %s" % (example.guid))
|
|
||||||
logger.info("tokens: %s" % " ".join(
|
|
||||||
[str(x) for x in tokens]))
|
|
||||||
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
|
||||||
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
|
||||||
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
|
|
||||||
logger.info("label: %s (id = %d)" % (example.label, label_id))
|
|
||||||
|
|
||||||
features.append(
|
|
||||||
InputFeatures(input_ids=input_ids,
|
|
||||||
input_mask=input_mask,
|
|
||||||
segment_ids=segment_ids,
|
|
||||||
label_id=label_id))
|
|
||||||
return features
|
|
||||||
|
|
||||||
|
|
||||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
|
||||||
"""Truncates a sequence pair in place to the maximum length."""
|
|
||||||
|
|
||||||
# This is a simple heuristic which will always truncate the longer sequence
|
|
||||||
# one token at a time. This makes more sense than truncating an equal percent
|
|
||||||
# of tokens from each, since if one sequence is very short then each token
|
|
||||||
# that's truncated likely contains more information than a longer sequence.
|
|
||||||
while True:
|
|
||||||
total_length = len(tokens_a) + len(tokens_b)
|
|
||||||
if total_length <= max_length:
|
|
||||||
break
|
|
||||||
if len(tokens_a) > len(tokens_b):
|
|
||||||
tokens_a.pop()
|
|
||||||
else:
|
|
||||||
tokens_b.pop()
|
|
||||||
|
|
||||||
|
|
||||||
def simple_accuracy(preds, labels):
|
|
||||||
return (preds == labels).mean()
|
|
||||||
|
|
||||||
|
|
||||||
def acc_and_f1(preds, labels):
|
|
||||||
acc = simple_accuracy(preds, labels)
|
|
||||||
f1 = f1_score(y_true=labels, y_pred=preds)
|
|
||||||
return {
|
|
||||||
"acc": acc,
|
|
||||||
"f1": f1,
|
|
||||||
"acc_and_f1": (acc + f1) / 2,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def pearson_and_spearman(preds, labels):
|
|
||||||
pearson_corr = pearsonr(preds, labels)[0]
|
|
||||||
spearman_corr = spearmanr(preds, labels)[0]
|
|
||||||
return {
|
|
||||||
"pearson": pearson_corr,
|
|
||||||
"spearmanr": spearman_corr,
|
|
||||||
"corr": (pearson_corr + spearman_corr) / 2,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def compute_metrics(task_name, preds, labels):
|
|
||||||
assert len(preds) == len(labels)
|
|
||||||
if task_name == "cola":
|
|
||||||
return {"mcc": matthews_corrcoef(labels, preds)}
|
|
||||||
elif task_name == "sst-2":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "mrpc":
|
|
||||||
return acc_and_f1(preds, labels)
|
|
||||||
elif task_name == "sts-b":
|
|
||||||
return pearson_and_spearman(preds, labels)
|
|
||||||
elif task_name == "qqp":
|
|
||||||
return acc_and_f1(preds, labels)
|
|
||||||
elif task_name == "mnli":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "mnli-mm":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "qnli":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "rte":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
elif task_name == "wnli":
|
|
||||||
return {"acc": simple_accuracy(preds, labels)}
|
|
||||||
else:
|
|
||||||
raise KeyError(task_name)
|
|
||||||
|
|
||||||
processors = {
|
|
||||||
"cola": ColaProcessor,
|
"cola": ColaProcessor,
|
||||||
"mnli": MnliProcessor,
|
"mnli": MnliProcessor,
|
||||||
"mnli-mm": MnliMismatchedProcessor,
|
"mnli-mm": MnliMismatchedProcessor,
|
||||||
@@ -591,7 +538,7 @@ processors = {
|
|||||||
"wnli": WnliProcessor,
|
"wnli": WnliProcessor,
|
||||||
}
|
}
|
||||||
|
|
||||||
output_modes = {
|
glue_output_modes = {
|
||||||
"cola": "classification",
|
"cola": "classification",
|
||||||
"mnli": "classification",
|
"mnli": "classification",
|
||||||
"mnli-mm": "classification",
|
"mnli-mm": "classification",
|
||||||
@@ -603,15 +550,3 @@ output_modes = {
|
|||||||
"rte": "classification",
|
"rte": "classification",
|
||||||
"wnli": "classification",
|
"wnli": "classification",
|
||||||
}
|
}
|
||||||
|
|
||||||
GLUE_TASKS_NUM_LABELS = {
|
|
||||||
"cola": 2,
|
|
||||||
"mnli": 3,
|
|
||||||
"mrpc": 2,
|
|
||||||
"sst-2": 2,
|
|
||||||
"sts-b": 1,
|
|
||||||
"qqp": 2,
|
|
||||||
"qnli": 2,
|
|
||||||
"rte": 2,
|
|
||||||
"wnli": 2,
|
|
||||||
}
|
|
||||||
120
transformers/data/processors/utils.py
Normal file
120
transformers/data/processors/utils.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
|
||||||
|
class InputExample(object):
|
||||||
|
"""
|
||||||
|
A single training/test example for simple sequence classification.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
guid: Unique id for the example.
|
||||||
|
text_a: string. The untokenized text of the first sequence. For single
|
||||||
|
sequence tasks, only this sequence must be specified.
|
||||||
|
text_b: (Optional) string. The untokenized text of the second sequence.
|
||||||
|
Only must be specified for sequence pair tasks.
|
||||||
|
label: (Optional) string. The label of the example. This should be
|
||||||
|
specified for train and dev examples, but not for test examples.
|
||||||
|
"""
|
||||||
|
def __init__(self, guid, text_a, text_b=None, label=None):
|
||||||
|
self.guid = guid
|
||||||
|
self.text_a = text_a
|
||||||
|
self.text_b = text_b
|
||||||
|
self.label = label
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return str(self.to_json_string())
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
"""Serializes this instance to a Python dictionary."""
|
||||||
|
output = copy.deepcopy(self.__dict__)
|
||||||
|
return output
|
||||||
|
|
||||||
|
def to_json_string(self):
|
||||||
|
"""Serializes this instance to a JSON string."""
|
||||||
|
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
class InputFeatures(object):
|
||||||
|
"""
|
||||||
|
A single set of features of data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_ids: Indices of input sequence tokens in the vocabulary.
|
||||||
|
attention_mask: Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
|
||||||
|
token_type_ids: Segment token indices to indicate first and second portions of the inputs.
|
||||||
|
label: Label corresponding to the input
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input_ids, attention_mask, token_type_ids, label):
|
||||||
|
self.input_ids = input_ids
|
||||||
|
self.attention_mask = attention_mask
|
||||||
|
self.token_type_ids = token_type_ids
|
||||||
|
self.label = label
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return str(self.to_json_string())
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
"""Serializes this instance to a Python dictionary."""
|
||||||
|
output = copy.deepcopy(self.__dict__)
|
||||||
|
return output
|
||||||
|
|
||||||
|
def to_json_string(self):
|
||||||
|
"""Serializes this instance to a JSON string."""
|
||||||
|
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
class DataProcessor(object):
|
||||||
|
"""Base class for data converters for sequence classification data sets."""
|
||||||
|
|
||||||
|
def get_example_from_tensor_dict(self, tensor_dict):
|
||||||
|
"""Gets an example from a dict with tensorflow tensors
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tensor_dict: Keys and values should match the corresponding Glue
|
||||||
|
tensorflow_dataset examples.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def get_train_examples(self, data_dir):
|
||||||
|
"""Gets a collection of `InputExample`s for the train set."""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def get_dev_examples(self, data_dir):
|
||||||
|
"""Gets a collection of `InputExample`s for the dev set."""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def get_labels(self):
|
||||||
|
"""Gets the list of labels for this data set."""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _read_tsv(cls, input_file, quotechar=None):
|
||||||
|
"""Reads a tab separated value file."""
|
||||||
|
with open(input_file, "r", encoding="utf-8-sig") as f:
|
||||||
|
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
||||||
|
lines = []
|
||||||
|
for line in reader:
|
||||||
|
if sys.version_info[0] == 2:
|
||||||
|
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||||
|
lines.append(line)
|
||||||
|
return lines
|
||||||
@@ -23,6 +23,24 @@ from botocore.exceptions import ClientError
|
|||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
try:
|
||||||
|
import tensorflow as tf
|
||||||
|
assert int(tf.__version__[0]) >= 2
|
||||||
|
_tf_available = True # pylint: disable=invalid-name
|
||||||
|
logger.info("TensorFlow version {} available.".format(tf.__version__))
|
||||||
|
except (ImportError, AssertionError):
|
||||||
|
_tf_available = False # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
_torch_available = True # pylint: disable=invalid-name
|
||||||
|
logger.info("PyTorch version {} available.".format(torch.__version__))
|
||||||
|
except ImportError:
|
||||||
|
_torch_available = False # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from torch.hub import _get_torch_home
|
from torch.hub import _get_torch_home
|
||||||
torch_cache_home = _get_torch_home()
|
torch_cache_home = _get_torch_home()
|
||||||
@@ -30,7 +48,7 @@ except ImportError:
|
|||||||
torch_cache_home = os.path.expanduser(
|
torch_cache_home = os.path.expanduser(
|
||||||
os.getenv('TORCH_HOME', os.path.join(
|
os.getenv('TORCH_HOME', os.path.join(
|
||||||
os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
|
os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
|
||||||
default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
|
default_cache_path = os.path.join(torch_cache_home, 'transformers')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@@ -47,12 +65,18 @@ except (AttributeError, ImportError):
|
|||||||
default_cache_path))
|
default_cache_path))
|
||||||
|
|
||||||
PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
||||||
|
TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
||||||
|
|
||||||
WEIGHTS_NAME = "pytorch_model.bin"
|
WEIGHTS_NAME = "pytorch_model.bin"
|
||||||
|
TF2_WEIGHTS_NAME = 'tf_model.h5'
|
||||||
TF_WEIGHTS_NAME = 'model.ckpt'
|
TF_WEIGHTS_NAME = 'model.ckpt'
|
||||||
CONFIG_NAME = "config.json"
|
CONFIG_NAME = "config.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
def is_torch_available():
|
||||||
|
return _torch_available
|
||||||
|
|
||||||
|
def is_tf_available():
|
||||||
|
return _tf_available
|
||||||
|
|
||||||
if not six.PY2:
|
if not six.PY2:
|
||||||
def add_start_docstrings(*docstr):
|
def add_start_docstrings(*docstr):
|
||||||
@@ -83,6 +107,9 @@ def url_to_filename(url, etag=None):
|
|||||||
Convert `url` into a hashed filename in a repeatable way.
|
Convert `url` into a hashed filename in a repeatable way.
|
||||||
If `etag` is specified, append its hash to the url's, delimited
|
If `etag` is specified, append its hash to the url's, delimited
|
||||||
by a period.
|
by a period.
|
||||||
|
If the url ends with .h5 (Keras HDF5 weights), adds '.h5' to the name
|
||||||
|
so that TF 2.0 can identify it as a HDF5 file
|
||||||
|
(see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
|
||||||
"""
|
"""
|
||||||
url_bytes = url.encode('utf-8')
|
url_bytes = url.encode('utf-8')
|
||||||
url_hash = sha256(url_bytes)
|
url_hash = sha256(url_bytes)
|
||||||
@@ -93,6 +120,9 @@ def url_to_filename(url, etag=None):
|
|||||||
etag_hash = sha256(etag_bytes)
|
etag_hash = sha256(etag_bytes)
|
||||||
filename += '.' + etag_hash.hexdigest()
|
filename += '.' + etag_hash.hexdigest()
|
||||||
|
|
||||||
|
if url.endswith('.h5'):
|
||||||
|
filename += '.h5'
|
||||||
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
@@ -102,7 +132,7 @@ def filename_to_url(filename, cache_dir=None):
|
|||||||
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
|
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
|
||||||
"""
|
"""
|
||||||
if cache_dir is None:
|
if cache_dir is None:
|
||||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
cache_dir = TRANSFORMERS_CACHE
|
||||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
cache_dir = str(cache_dir)
|
cache_dir = str(cache_dir)
|
||||||
|
|
||||||
@@ -133,7 +163,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
|||||||
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
||||||
"""
|
"""
|
||||||
if cache_dir is None:
|
if cache_dir is None:
|
||||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
cache_dir = TRANSFORMERS_CACHE
|
||||||
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
|
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
|
||||||
url_or_filename = str(url_or_filename)
|
url_or_filename = str(url_or_filename)
|
||||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
@@ -222,7 +252,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
|
|||||||
If it's not there, download it. Then return the path to the cached file.
|
If it's not there, download it. Then return the path to the cached file.
|
||||||
"""
|
"""
|
||||||
if cache_dir is None:
|
if cache_dir is None:
|
||||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
cache_dir = TRANSFORMERS_CACHE
|
||||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
cache_dir = str(cache_dir)
|
cache_dir = str(cache_dir)
|
||||||
if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
|
if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
|
||||||
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class AutoModel(object):
|
class AutoModel(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.AutoModel` is a generic model class
|
:class:`~transformers.AutoModel` is a generic model class
|
||||||
that will be instantiated as one of the base model classes of the library
|
that will be instantiated as one of the base model classes of the library
|
||||||
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -84,23 +84,23 @@ class AutoModel(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
model_args: (`optional`) Sequence of positional arguments:
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
||||||
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
state_dict: (`optional`) dict:
|
state_dict: (`optional`) dict:
|
||||||
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
@@ -120,7 +120,7 @@ class AutoModel(object):
|
|||||||
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -157,7 +157,7 @@ class AutoModel(object):
|
|||||||
|
|
||||||
class AutoModelWithLMHead(object):
|
class AutoModelWithLMHead(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
|
:class:`~transformers.AutoModelWithLMHead` is a generic model class
|
||||||
that will be instantiated as one of the language modeling model classes of the library
|
that will be instantiated as one of the language modeling model classes of the library
|
||||||
when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -208,23 +208,23 @@ class AutoModelWithLMHead(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
model_args: (`optional`) Sequence of positional arguments:
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
||||||
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
state_dict: (`optional`) dict:
|
state_dict: (`optional`) dict:
|
||||||
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
@@ -244,7 +244,7 @@ class AutoModelWithLMHead(object):
|
|||||||
Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
|
Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -281,7 +281,7 @@ class AutoModelWithLMHead(object):
|
|||||||
|
|
||||||
class AutoModelForSequenceClassification(object):
|
class AutoModelForSequenceClassification(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
|
:class:`~transformers.AutoModelForSequenceClassification` is a generic model class
|
||||||
that will be instantiated as one of the sequence classification model classes of the library
|
that will be instantiated as one of the sequence classification model classes of the library
|
||||||
when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -326,23 +326,23 @@ class AutoModelForSequenceClassification(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
model_args: (`optional`) Sequence of positional arguments:
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
||||||
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
state_dict: (`optional`) dict:
|
state_dict: (`optional`) dict:
|
||||||
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
@@ -362,7 +362,7 @@ class AutoModelForSequenceClassification(object):
|
|||||||
Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
|
Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -392,7 +392,7 @@ class AutoModelForSequenceClassification(object):
|
|||||||
|
|
||||||
class AutoModelForQuestionAnswering(object):
|
class AutoModelForQuestionAnswering(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
|
:class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
|
||||||
that will be instantiated as one of the question answering model classes of the library
|
that will be instantiated as one of the question answering model classes of the library
|
||||||
when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
@@ -435,23 +435,23 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
model_args: (`optional`) Sequence of positional arguments:
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
|
|
||||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
||||||
|
|
||||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
||||||
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
|
|
||||||
state_dict: (`optional`) dict:
|
state_dict: (`optional`) dict:
|
||||||
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
@@ -471,7 +471,7 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
|
Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -118,26 +118,27 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
|||||||
|
|
||||||
|
|
||||||
def gelu(x):
|
def gelu(x):
|
||||||
"""Implementation of the gelu activation function.
|
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
||||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||||
Also see https://arxiv.org/abs/1606.08415
|
Also see https://arxiv.org/abs/1606.08415
|
||||||
"""
|
"""
|
||||||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||||
|
|
||||||
|
def gelu_new(x):
|
||||||
|
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
|
||||||
|
Also see https://arxiv.org/abs/1606.08415
|
||||||
|
"""
|
||||||
|
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||||
|
|
||||||
def swish(x):
|
def swish(x):
|
||||||
return x * torch.sigmoid(x)
|
return x * torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
|
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
|
||||||
|
|
||||||
|
|
||||||
try:
|
BertLayerNorm = torch.nn.LayerNorm
|
||||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
|
||||||
except (ImportError, AttributeError) as e:
|
|
||||||
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
|
||||||
BertLayerNorm = torch.nn.LayerNorm
|
|
||||||
|
|
||||||
class BertEmbeddings(nn.Module):
|
class BertEmbeddings(nn.Module):
|
||||||
"""Construct the embeddings from word, position and token_type embeddings.
|
"""Construct the embeddings from word, position and token_type embeddings.
|
||||||
@@ -195,7 +196,7 @@ class BertSelfAttention(nn.Module):
|
|||||||
x = x.view(*new_x_shape)
|
x = x.view(*new_x_shape)
|
||||||
return x.permute(0, 2, 1, 3)
|
return x.permute(0, 2, 1, 3)
|
||||||
|
|
||||||
def forward(self, hidden_states, attention_mask, head_mask=None):
|
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||||
mixed_query_layer = self.query(hidden_states)
|
mixed_query_layer = self.query(hidden_states)
|
||||||
mixed_key_layer = self.key(hidden_states)
|
mixed_key_layer = self.key(hidden_states)
|
||||||
mixed_value_layer = self.value(hidden_states)
|
mixed_value_layer = self.value(hidden_states)
|
||||||
@@ -207,8 +208,9 @@ class BertSelfAttention(nn.Module):
|
|||||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
||||||
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
||||||
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
if attention_mask is not None:
|
||||||
attention_scores = attention_scores + attention_mask
|
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
||||||
|
attention_scores = attention_scores + attention_mask
|
||||||
|
|
||||||
# Normalize the attention scores to probabilities.
|
# Normalize the attention scores to probabilities.
|
||||||
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
||||||
@@ -275,7 +277,7 @@ class BertAttention(nn.Module):
|
|||||||
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
||||||
self.pruned_heads = self.pruned_heads.union(heads)
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def forward(self, input_tensor, attention_mask, head_mask=None):
|
def forward(self, input_tensor, attention_mask=None, head_mask=None):
|
||||||
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
||||||
attention_output = self.output(self_outputs[0], input_tensor)
|
attention_output = self.output(self_outputs[0], input_tensor)
|
||||||
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
||||||
@@ -318,7 +320,7 @@ class BertLayer(nn.Module):
|
|||||||
self.intermediate = BertIntermediate(config)
|
self.intermediate = BertIntermediate(config)
|
||||||
self.output = BertOutput(config)
|
self.output = BertOutput(config)
|
||||||
|
|
||||||
def forward(self, hidden_states, attention_mask, head_mask=None):
|
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||||
attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
|
attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
|
||||||
attention_output = attention_outputs[0]
|
attention_output = attention_outputs[0]
|
||||||
intermediate_output = self.intermediate(attention_output)
|
intermediate_output = self.intermediate(attention_output)
|
||||||
@@ -334,7 +336,7 @@ class BertEncoder(nn.Module):
|
|||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
||||||
|
|
||||||
def forward(self, hidden_states, attention_mask, head_mask=None):
|
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||||
all_hidden_states = ()
|
all_hidden_states = ()
|
||||||
all_attentions = ()
|
all_attentions = ()
|
||||||
for i, layer_module in enumerate(self.layer):
|
for i, layer_module in enumerate(self.layer):
|
||||||
@@ -480,9 +482,9 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
|
|||||||
https://pytorch.org/docs/stable/nn.html#module
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
BERT_INPUTS_DOCSTRING = r"""
|
BERT_INPUTS_DOCSTRING = r"""
|
||||||
@@ -506,9 +508,9 @@ BERT_INPUTS_DOCSTRING = r"""
|
|||||||
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
Indices can be obtained using :class:`transformers.BertTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -372,9 +372,9 @@ DISTILBERT_START_DOCSTRING = r"""
|
|||||||
https://medium.com/huggingface/distilbert-8cf3380435b5
|
https://medium.com/huggingface/distilbert-8cf3380435b5
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
DISTILBERT_INPUTS_DOCSTRING = r"""
|
DISTILBERT_INPUTS_DOCSTRING = r"""
|
||||||
@@ -280,9 +280,9 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in
|
|||||||
https://pytorch.org/docs/stable/nn.html#module
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
||||||
@@ -290,9 +290,9 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Indices of input sequence tokens in the vocabulary.
|
Indices of input sequence tokens in the vocabulary.
|
||||||
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
|
Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
@@ -367,6 +367,13 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
input_ids = input_ids.view(-1, input_shape[-1])
|
||||||
|
if token_type_ids is not None:
|
||||||
|
token_type_ids = token_type_ids.view(-1, input_shape[-1])
|
||||||
|
if position_ids is not None:
|
||||||
|
position_ids = position_ids.view(-1, input_shape[-1])
|
||||||
|
|
||||||
if past is None:
|
if past is None:
|
||||||
past_length = 0
|
past_length = 0
|
||||||
past = [None] * len(self.h)
|
past = [None] * len(self.h)
|
||||||
@@ -378,6 +385,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
|
|
||||||
# Attention mask.
|
# Attention mask.
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
|
attention_mask = attention_mask.view(-1, input_shape[-1])
|
||||||
# We create a 3D attention mask from a 2D tensor mask.
|
# We create a 3D attention mask from a 2D tensor mask.
|
||||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||||
@@ -407,14 +415,9 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
head_mask = [None] * self.config.n_layer
|
head_mask = [None] * self.config.n_layer
|
||||||
|
|
||||||
input_shape = input_ids.size()
|
|
||||||
input_ids = input_ids.view(-1, input_ids.size(-1))
|
|
||||||
position_ids = position_ids.view(-1, position_ids.size(-1))
|
|
||||||
|
|
||||||
inputs_embeds = self.wte(input_ids)
|
inputs_embeds = self.wte(input_ids)
|
||||||
position_embeds = self.wpe(position_ids)
|
position_embeds = self.wpe(position_ids)
|
||||||
if token_type_ids is not None:
|
if token_type_ids is not None:
|
||||||
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
|
|
||||||
token_type_embeds = self.wte(token_type_ids)
|
token_type_embeds = self.wte(token_type_ids)
|
||||||
else:
|
else:
|
||||||
token_type_embeds = 0
|
token_type_embeds = 0
|
||||||
@@ -490,7 +493,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
|
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||||
|
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||||
@@ -586,7 +589,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
||||||
|
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||||
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
||||||
@@ -294,9 +294,9 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in
|
|||||||
https://pytorch.org/docs/stable/nn.html#module
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
||||||
@@ -304,9 +304,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Indices of input sequence tokens in the vocabulary.
|
Indices of input sequence tokens in the vocabulary.
|
||||||
GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
|
Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -43,6 +43,9 @@ class RobertaEmbeddings(BertEmbeddings):
|
|||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RobertaEmbeddings, self).__init__(config)
|
super(RobertaEmbeddings, self).__init__(config)
|
||||||
self.padding_idx = 1
|
self.padding_idx = 1
|
||||||
|
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
|
||||||
|
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
|
||||||
|
padding_idx=self.padding_idx)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
||||||
seq_length = input_ids.size(1)
|
seq_length = input_ids.size(1)
|
||||||
@@ -77,9 +80,9 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
|||||||
https://pytorch.org/docs/stable/nn.html#module
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the
|
config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
|
||||||
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
|
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ROBERTA_INPUTS_DOCSTRING = r"""
|
ROBERTA_INPUTS_DOCSTRING = r"""
|
||||||
@@ -102,8 +105,8 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
|||||||
RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -361,9 +364,9 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
|||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0``
|
``token_type_ids: 0 0 0 0 0 0 0``
|
||||||
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
Indices can be obtained using :class:`transformers.BertTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
||||||
Segment token indices to indicate first and second portions of the inputs.
|
Segment token indices to indicate first and second portions of the inputs.
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
||||||
501
transformers/modeling_tf_auto.py
Normal file
501
transformers/modeling_tf_auto.py
Normal file
@@ -0,0 +1,501 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Auto Model class. """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
|
||||||
|
from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
|
||||||
|
from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
|
||||||
|
from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
|
||||||
|
from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple
|
||||||
|
from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
|
||||||
|
from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
|
||||||
|
from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
|
||||||
|
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TFAutoModel(object):
    r"""
        Generic dispatcher that resolves to one of the base model classes of the library.

        :class:`~transformers.TFAutoModel` is never instantiated itself. Calling
        `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` returns an instance
        of a concrete base model class, chosen by pattern matching on the
        `pretrained_model_name_or_path` string.

        The first pattern found in the string wins; patterns are tested in this order:
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFBertModel (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetModel (XLNet model)
            - contains `xlm`: TFXLMModel (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
        # Direct construction is always refused; `from_pretrained` is the only entry point.
        refusal = ("TFAutoModel is designed to be instantiated "
                   "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
        raise EnvironmentError(refusal)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the base model classes of the library
        from a pre-trained model configuration.

        The model class is selected as the first pattern found
        in the `pretrained_model_name_or_path` string (tested in the following order):
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFBertModel (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetModel (XLNet model)
            - contains `xlm`: TFXLMModel (XLM model)

        This method only picks the class: every argument is forwarded unchanged to the
        ``from_pretrained`` method of the selected class, which interprets the options below.

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

            from_pt: (`optional`) boolean:
                Set to True if the checkpoint is a PyTorch checkpoint.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from the saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if none of the known patterns occurs in ``pretrained_model_name_or_path``.

        Examples::

            model = TFAutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
            config = AutoConfig.from_json_file('./pt_model/bert_pt_model_config.json')
            model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
        # Order matters: 'distilbert' and 'roberta' both contain 'bert', so the
        # more specific patterns have to be tried before the plain 'bert' one.
        ordered_candidates = (
            ('distilbert', TFDistilBertModel),
            ('roberta', TFRobertaModel),
            ('bert', TFBertModel),
            ('openai-gpt', TFOpenAIGPTModel),
            ('gpt2', TFGPT2Model),
            ('transfo-xl', TFTransfoXLModel),
            ('xlnet', TFXLNetModel),
            ('xlm', TFXLMModel),
        )
        for pattern, model_class in ordered_candidates:
            if pattern in pretrained_model_name_or_path:
                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        unrecognized = ("Unrecognized model identifier in {}. Should contains one of "
                        "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                        "'xlm', 'roberta'")
        raise ValueError(unrecognized.format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class TFAutoModelWithLMHead(object):
    r"""
        Generic dispatcher that resolves to one of the language modeling model classes of the library.

        :class:`~transformers.TFAutoModelWithLMHead` is never instantiated itself. Calling
        `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` returns an
        instance of a concrete language modeling class, chosen by pattern matching on the
        `pretrained_model_name_or_path` string.

        The first pattern found in the string wins; patterns are tested in this order:
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
        # Direct construction is always refused; `from_pretrained` is the only entry point.
        refusal = ("TFAutoModelWithLMHead is designed to be instantiated "
                   "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
        raise EnvironmentError(refusal)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The model class is selected as the first pattern found
        in the `pretrained_model_name_or_path` string (tested in the following order):
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)

        This method only picks the class: every argument is forwarded unchanged to the
        ``from_pretrained`` method of the selected class, which interprets the options below.

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

            from_pt: (`optional`) boolean:
                Set to True if the checkpoint is a PyTorch checkpoint.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from the saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if none of the known patterns occurs in ``pretrained_model_name_or_path``.

        Examples::

            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
            config = AutoConfig.from_json_file('./pt_model/bert_pt_model_config.json')
            model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
        # Order matters: 'distilbert' and 'roberta' both contain 'bert', so the
        # more specific patterns have to be tried before the plain 'bert' one.
        ordered_candidates = (
            ('distilbert', TFDistilBertForMaskedLM),
            ('roberta', TFRobertaForMaskedLM),
            ('bert', TFBertForMaskedLM),
            ('openai-gpt', TFOpenAIGPTLMHeadModel),
            ('gpt2', TFGPT2LMHeadModel),
            ('transfo-xl', TFTransfoXLLMHeadModel),
            ('xlnet', TFXLNetLMHeadModel),
            ('xlm', TFXLMWithLMHeadModel),
        )
        for pattern, model_class in ordered_candidates:
            if pattern in pretrained_model_name_or_path:
                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        unrecognized = ("Unrecognized model identifier in {}. Should contains one of "
                        "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                        "'xlm', 'roberta'")
        raise ValueError(unrecognized.format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class TFAutoModelForSequenceClassification(object):
    r"""
        :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
        that will be instantiated as one of the sequence classification model classes of the library
        when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: TFBertForSequenceClassification (Bert model)
            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
            - contains `xlm`: TFXLMForSequenceClassification (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # The message must name *this* class so users are pointed at the right factory method.
        raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated "
            "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the sequence classification model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: TFBertForSequenceClassification (Bert model)
            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
            - contains `xlm`: TFXLMForSequenceClassification (XLM model)

        The order matters: `distilbert` and `roberta` are tested before `bert` because
        both names contain the substring `bert`.

        NOTE(review): PyTorch-style `model.eval()` / `model.train()` do not apply to the TF 2.0 models.
        The TF 2.0 layers of this library take a `training` argument at call time instead
        (presumably the returned model does too; confirm against `TFPreTrainedModel`).

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attentions=True``). Behave differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Returns:
            Whatever the selected model class' own ``from_pretrained()`` returns for the same arguments.

        Raises:
            ValueError: if none of the patterns above is found in ``pretrained_model_name_or_path``.

        Examples::

            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
            assert model.config.output_attentions == True
            # Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
        # Most specific substrings first: 'distilbert' and 'roberta' both contain 'bert'.
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class TFAutoModelForQuestionAnswering(object):
    r"""
        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: TFBertForQuestionAnswering (Bert model)
            - contains `xlnet`: TFXLNetForQuestionAnsweringSimple (XLNet model)
            - contains `xlm`: TFXLMForQuestionAnsweringSimple (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # The message must name *this* class so users are pointed at the right factory method.
        raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated "
            "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: TFBertForQuestionAnswering (Bert model)
            - contains `xlnet`: TFXLNetForQuestionAnsweringSimple (XLNet model)
            - contains `xlm`: TFXLMForQuestionAnsweringSimple (XLM model)

        The order matters: `distilbert` is tested before `bert` because it contains the substring `bert`.

        NOTE(review): PyTorch-style `model.eval()` / `model.train()` do not apply to the TF 2.0 models.
        The TF 2.0 layers of this library take a `training` argument at call time instead
        (presumably the returned model does too; confirm against `TFPreTrainedModel`).

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.

            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attentions=True``). Behave differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Returns:
            Whatever the selected model class' own ``from_pretrained()`` returns for the same arguments.

        Raises:
            ValueError: if none of the patterns above is found in ``pretrained_model_name_or_path``.

        Examples::

            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
            assert model.config.output_attentions == True
            # Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)

        """
        # Most specific substring first: 'distilbert' contains 'bert'.
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
|
||||||
1044
transformers/modeling_tf_bert.py
Normal file
1044
transformers/modeling_tf_bert.py
Normal file
File diff suppressed because it is too large
Load Diff
745
transformers/modeling_tf_distilbert.py
Normal file
745
transformers/modeling_tf_distilbert.py
Normal file
@@ -0,0 +1,745 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 DistilBERT model
|
||||||
|
"""
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import copy
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .configuration_distilbert import DistilBertConfig
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Maps each pretrained DistilBERT shortcut name to the URL of its `tf_model.h5` weights file.
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5"
}
|
||||||
|
|
||||||
|
|
||||||
|
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
|
||||||
|
def gelu(x):
    """ Exact (erf-based) Gaussian Error Linear Unit: ``x * 0.5 * (1 + erf(x / sqrt(2)))``.

        This is the formulation originally used in the Google Bert repo.
        OpenAI GPT uses a tanh approximation that gives slightly different results:
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415

        Args:
            x: tensor to apply the activation to.
        Returns:
            `x` multiplied element-wise by the Gaussian CDF evaluated at `x`.
    """
    erf_term = tf.math.erf(x / tf.math.sqrt(2.0))
    gaussian_cdf = 0.5 * (1.0 + erf_term)
    return x * gaussian_cdf
|
||||||
|
|
||||||
|
def gelu_new(x):
    """ Tanh approximation of the Gaussian Error Linear Unit.

        Computes ``x * 0.5 * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``.
        Original paper: https://arxiv.org/abs/1606.08415

        Args:
            x: float Tensor to perform activation.
        Returns:
            `x` with the GELU activation applied.
    """
    tanh_arg = np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))
    approx_cdf = 0.5 * (1.0 + tf.tanh(tanh_arg))
    return x * approx_cdf
|
||||||
|
|
||||||
|
def load_distilbert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """ Load the weights of a PyTorch DistilBERT checkpoint into a TF 2.0 model.

        Args:
            tf_model: the TF 2.0 model to fill; it is called once on a small dummy batch
                and then handed to `load_pytorch_checkpoint_in_tf2_model`.
            pytorch_checkpoint_path: path of the PyTorch checkpoint, forwarded unchanged.
        Returns:
            The return value of `load_pytorch_checkpoint_in_tf2_model`.
    """
    # Small dummy batch of token ids with a matching attention mask.
    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
    tf_inputs = [inputs_list, attns_list]

    # Build the network: the forward pass is run only for its side effect, so its output is discarded.
    tf_model(tf_inputs, training=False)

    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
|
||||||
|
|
||||||
|
class TFEmbeddings(tf.keras.layers.Layer):
    """ Word + position embeddings followed by LayerNorm and dropout.

        The word embedding matrix is also used as the output projection
        when the layer is called with ``mode="linear"`` (weight sharing).
    """
    def __init__(self, config, **kwargs):
        super(TFEmbeddings, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dim = config.dim
        self.initializer_range = config.initializer_range
        # NOTE(review): `build()` rebinds `self.word_embeddings` to a raw weight created with
        # `add_weight`, so this TFSharedEmbeddings instance is replaced once the layer is built.
        # Confirm whether it is still needed.
        self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
                                                  config.dim,
                                                  initializer_range=config.initializer_range,
                                                  name='word_embeddings')  # padding_idx=0)
        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
                                                             config.dim,
                                                             embeddings_initializer=get_initializer(config.initializer_range),
                                                             name='position_embeddings')
        # Sinusoidal position embeddings are not supported by this layer.
        if config.sinusoidal_pos_embds:
            raise NotImplementedError

        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.dropout)

    def build(self, input_shape):
        """Create the shared word embedding weight, of shape [vocab_size, dim]. """
        with tf.name_scope("word_embeddings"):
            # Create and initialize weights. The random normal initializer was chosen
            # arbitrarily, and works well.
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.vocab_size, self.dim],
                initializer=get_initializer(self.initializer_range))
        super(TFEmbeddings, self).build(input_shape)

    def call(self, inputs, mode="embedding", training=False):
        """Get token embeddings of inputs, or project hidden states back onto the vocabulary.
        Args:
            inputs: if mode == "embedding", either an `input_ids` tensor or a list/tuple
                (input_ids, position_ids); if mode == "linear", a float tensor of hidden states.
            mode: string, a valid value is one of "embedding" and "linear".
            training: forwarded to the dropout layer in "embedding" mode; unused in "linear" mode.
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, training=training)
        elif mode == "linear":
            return self._linear(inputs)
        else:
            raise ValueError("mode {} is not valid.".format(mode))

    def _embedding(self, inputs, training=False):
        """
        Parameters
        ----------
        inputs: either `input_ids` alone, or a list/tuple (input_ids, position_ids)
            input_ids: tf.Tensor(bs, max_seq_length)
                The token ids to embed.
            position_ids: optional; when missing or None, positions 0..seq_length-1 are used.
        training: forwarded to the dropout layer.

        Outputs
        -------
        embeddings: tf.Tensor(bs, max_seq_length, dim)
            The embedded tokens (plus position embeddings, no token_type embeddings)
        """
        if not isinstance(inputs, (tuple, list)):
            input_ids = inputs
            position_ids = None
        else:
            input_ids, position_ids = inputs

        seq_length = tf.shape(input_ids)[1]
        if position_ids is None:
            # Default positions, with a leading axis of size 1 so they broadcast over the batch.
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]

        # Row lookup in the word embedding weight created in `build()`.
        word_embeddings = tf.gather(self.word_embeddings, input_ids)
        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)

        embeddings = word_embeddings + position_embeddings  # (bs, max_seq_length, dim)
        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
        embeddings = self.dropout(embeddings, training=training)  # (bs, max_seq_length, dim)
        return embeddings

    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.
            The linear layer reuses the word embedding weight (transposed); no bias is added.
            Args:
                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = tf.shape(inputs)[0]
        length = tf.shape(inputs)[1]

        # Flatten batch and length so a single 2-D matmul can be used, then restore them.
        x = tf.reshape(inputs, [-1, self.dim])
        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)

        return tf.reshape(logits, [batch_size, length, self.vocab_size])
|
||||||
|
|
||||||
|
|
||||||
|
class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
    """ Multi-head scaled dot-product attention with separate query/key/value/output projections. """
    def __init__(self, config, **kwargs):
        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
        self.output_attentions = config.output_attentions

        # `dim` is split evenly across the heads.
        assert self.dim % self.n_heads == 0

        self.q_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="q_lin")
        self.k_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="k_lin")
        self.v_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="v_lin")
        self.out_lin = tf.keras.layers.Dense(config.dim,
                                             kernel_initializer=get_initializer(config.initializer_range),
                                             name="out_lin")

        self.pruned_heads = set()

    def prune_heads(self, heads):
        """ Head pruning is not implemented for this layer. """
        raise NotImplementedError

    def call(self, inputs, training=False):
        """
        Parameters
        ----------
        inputs: list/tuple of five elements (query, key, value, mask, head_mask)
            query: tf.Tensor(bs, seq_length, dim)
            key: tf.Tensor(bs, seq_length, dim)
            value: tf.Tensor(bs, seq_length, dim)
            mask: tf.Tensor(bs, seq_length)
            head_mask: multiplied into the attention weights when not None
        training: forwarded to the attention dropout layer.

        Outputs
        -------
        context: tf.Tensor(bs, seq_length, dim)
            Contextualized layer.
        weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            Attention weights. Optional: only if `output_attentions=True`
        """
        query, key, value, mask, head_mask = inputs
        bs, q_length, dim = shape_list(query)
        k_length = shape_list(key)[1]
        # NOTE: `dim` is not checked against `self.dim`, and key/value shapes are not compared here.

        dim_per_head = self.dim // self.n_heads

        assert 2 <= len(tf.shape(mask)) <= 3
        # Only the 2-D (non-causal) mask layout is handled below: the mask is broadcast
        # over heads and query positions.
        # NOTE(review): a 3-D mask passes the assert above but does not fit this reshape in general.
        mask_reshape = [bs, 1, 1, k_length]

        def shape(x):
            """ separate heads """
            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))

        def unshape(x):
            """ group heads """
            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))

        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)

        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, q_length, k_length)
        mask = tf.reshape(mask, mask_reshape)  # (bs, 1, 1, k_length)
        # Additive masking: positions where mask == 0 get a very large negative score.
        # PyTorch equivalent: scores.masked_fill_(mask, -float('inf'))
        # NOTE(review): assumes `mask` is a float tensor with 1.0 on positions to keep; confirm caller casts it.
        scores = scores - 1e30 * (1.0 - mask)

        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
        context = unshape(context)  # (bs, q_length, dim)
        context = self.out_lin(context)  # (bs, q_length, dim)

        if self.output_attentions:
            return (context, weights)
        else:
            return (context,)
|
||||||
|
|
||||||
|
class TFFFN(tf.keras.layers.Layer):
    """ Feed-forward sub-layer: lin1 -> activation -> lin2 -> dropout. """
    def __init__(self, config, **kwargs):
        super(TFFFN, self).__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        # Expansion to `hidden_dim`, then projection back to `dim`.
        self.lin1 = tf.keras.layers.Dense(
            units=config.hidden_dim,
            kernel_initializer=get_initializer(config.initializer_range),
            name="lin1")
        self.lin2 = tf.keras.layers.Dense(
            units=config.dim,
            kernel_initializer=get_initializer(config.initializer_range),
            name="lin2")
        assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
        if config.activation == 'gelu':
            self.activation = tf.keras.layers.Activation(gelu)
        else:
            self.activation = tf.keras.activations.relu

    def call(self, input, training=False):
        """ Apply the feed-forward transformation to `input`; `training` is forwarded to dropout. """
        activated = self.activation(self.lin1(input))
        projected = self.lin2(activated)
        return self.dropout(projected, training=training)
|
||||||
|
|
||||||
|
|
||||||
|
class TFTransformerBlock(tf.keras.layers.Layer):
    """ One transformer layer: self-attention and feed-forward, each followed by a residual add and LayerNorm. """
    def __init__(self, config, **kwargs):
        super(TFTransformerBlock, self).__init__(**kwargs)

        # Plain configuration values kept on the layer.
        self.n_heads = config.n_heads
        self.dim = config.dim
        self.hidden_dim = config.hidden_dim
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.activation = config.activation
        self.output_attentions = config.output_attentions

        assert config.dim % config.n_heads == 0

        # Attention sub-layer and its normalization.
        self.attention = TFMultiHeadSelfAttention(config, name="attention")
        self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")

        # Feed-forward sub-layer and its normalization.
        self.ffn = TFFFN(config, name="ffn")
        self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")

    def call(self, inputs, training=False):  # removed: src_enc=None, src_len=None
        """
        Parameters
        ----------
        inputs: list/tuple of three elements (x, attn_mask, head_mask)
            x: tf.Tensor(bs, seq_length, dim)
            attn_mask: tf.Tensor(bs, seq_length)
            head_mask: forwarded unchanged to the attention layer
        training: forwarded to the attention and feed-forward sub-layers.

        Outputs
        -------
        A tuple whose last element is always `ffn_output`:
        sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            The attention weights. First element, present only if `output_attentions` is set.
        ffn_output: tf.Tensor(bs, seq_length, dim)
            The output of the transformer block contextualization.
        """
        x, attn_mask, head_mask = inputs

        # Self-attention, with the same tensor used as query, key and value.
        attn_outputs = self.attention([x, x, x, attn_mask, head_mask], training=training)
        if self.output_attentions:
            # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
            sa_output, sa_weights = attn_outputs
        else:
            # The attention layer always returns a tuple; only the context is present here.
            sa_output = attn_outputs[0]
        sa_output = self.sa_layer_norm(sa_output + x)  # (bs, seq_length, dim)

        # Feed-forward network, residual connection and normalization.
        ffn_output = self.ffn(sa_output, training=training)  # (bs, seq_length, dim)
        ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)

        if self.output_attentions:
            return (sa_weights, ffn_output)
        return (ffn_output,)
|
||||||
|
|
||||||
|
|
||||||
|
class TFTransformer(tf.keras.layers.Layer):
    """Sequential stack of ``config.n_layers`` ``TFTransformerBlock`` layers."""

    def __init__(self, config, **kwargs):
        super(TFTransformer, self).__init__(**kwargs)
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        blocks = []
        for index in range(config.n_layers):
            blocks.append(TFTransformerBlock(config, name='layer_._{}'.format(index)))
        self.layer = blocks

    def call(self, inputs, training=False):
        """Run the input through every block in order.

        Parameters
        ----------
        inputs: sequence ``(x, attn_mask, head_mask)``
            x: tf.Tensor(bs, seq_length, dim)
                Input sequence embedded.
            attn_mask: tf.Tensor(bs, seq_length)
                Attention mask on the sequence.
            head_mask: indexable with one entry per layer
                ``head_mask[i]`` is handed to block ``i``.

        Outputs
        -------
        hidden_state: tf.Tensor(bs, seq_length, dim)
            Sequence of hidden states in the last (top) layer
        all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
            The input of every block followed by the output of the last one
            (``n_layers + 1`` entries).
            Optional: only if output_hidden_states=True
        all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
            Tuple of length n_layers with the attention weights from each layer
            Optional: only if output_attentions=True
        """
        hidden_state, attn_mask, head_mask = inputs

        collected_states = []
        collected_attentions = []
        # Blocks return (attentions, hidden) or (hidden,) depending on the flag.
        expected_len = 2 if self.output_attentions else 1

        for index, block in enumerate(self.layer):
            if self.output_hidden_states:
                collected_states.append(hidden_state)

            block_outputs = block([hidden_state, attn_mask, head_mask[index]], training=training)
            hidden_state = block_outputs[-1]

            assert len(block_outputs) == expected_len
            if self.output_attentions:
                collected_attentions.append(block_outputs[0])

        outputs = (hidden_state,)
        if self.output_hidden_states:
            # Add the output of the last layer.
            collected_states.append(hidden_state)
            outputs += (tuple(collected_states),)
        if self.output_attentions:
            outputs += (tuple(collected_attentions),)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFDistilBertMainLayer(tf.keras.layers.Layer):
    """Embeddings followed by the transformer encoder; shared by all TF DistilBERT models in this file."""

    def __init__(self, config, **kwargs):
        super(TFDistilBertMainLayer, self).__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers

        self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings
        self.transformer = TFTransformer(config, name="transformer")  # Encoder

    def _resize_token_embeddings(self, new_num_tokens):
        """Not implemented for this layer."""
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """Not implemented for this layer."""
        raise NotImplementedError

    def call(self, inputs, attention_mask=None, head_mask=None, training=False):
        """Embed ``input_ids`` and run the encoder.

        ``inputs`` may be a single tensor of input ids, a list/tuple
        ``[input_ids, attention_mask, head_mask]`` (trailing items optional),
        or a dict keyed by ``'input_ids'``, ``'attention_mask'`` and
        ``'head_mask'``. Values found inside ``inputs`` take precedence over
        the keyword arguments.

        Returns the transformer output tuple:
        last-layer hidden-state, (all hidden_states), (all attentions).

        Raises ``NotImplementedError`` when a head mask is supplied.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            if len(inputs) > 1:
                attention_mask = inputs[1]
            if len(inputs) > 2:
                head_mask = inputs[2]
            assert len(inputs) <= 3, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 3, "Too many inputs."
        else:
            input_ids = inputs

        # Default: attend to every position.
        if attention_mask is None:
            attention_mask = tf.ones(shape_list(input_ids))  # (bs, seq_length)
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)

        # Head masking is not supported yet; the encoder still expects one
        # (empty) entry per layer.
        if head_mask is not None:
            raise NotImplementedError
        head_mask = [None] * self.num_hidden_layers

        embedded = self.embeddings(input_ids)  # (bs, seq_length, dim)
        return self.transformer([embedded, attention_mask, head_mask], training=training)
|
||||||
|
|
||||||
|
|
||||||
|
### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
|
||||||
|
class TFDistilBertPreTrainedModel(TFPreTrainedModel):
    """ Abstract base of the TF DistilBERT models: binds the DistilBERT
        configuration class, pretrained archive map and PyTorch weight loader
        to the generic ``TFPreTrainedModel`` machinery.
    """
    # Attribute name under which concrete models store their main layer.
    base_model_prefix = "distilbert"
    config_class = DistilBertConfig
    pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    load_pt_weights = load_distilbert_pt_weights_in_tf2
|
||||||
|
|
||||||
|
|
||||||
|
# Shared introduction prepended to the docstring of every TF DistilBERT model
# class through `add_start_docstrings`.
DISTILBERT_START_DOCSTRING = r"""
    DistilBERT is a small, fast, cheap and light Transformer model
    trained by distilling Bert base. It has 40% fewer parameters than
    `bert-base-uncased`, runs 60% faster while preserving over 95% of
    Bert's performances as measured on the GLUE language understanding benchmark.

    Here are the differences between the interface of Bert and DistilBert:

    - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
    - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.

    For more information on DistilBERT, please refer to our
    `detailed blog post`_

    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.

    .. _`detailed blog post`:
        https://medium.com/huggingface/distilbert-8cf3380435b5

    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model

    Note on the model inputs:
        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :

        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids])` or `model([input_ids, attention_mask])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'attention_mask': attention_mask})`

    Parameters:
        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
|
||||||
|
|
||||||
|
# Shared description of the call inputs, appended to the docstring of every
# TF DistilBERT model class through `add_start_docstrings`.
DISTILBERT_INPUTS_DOCSTRING = r"""
    Inputs:
        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            The input sequences should start with `[CLS]` and end with `[SEP]` tokens.

            For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
            Not supported yet by the TF 2.0 implementation: passing a value raises ``NotImplementedError``.
"""
|
||||||
|
|
||||||
|
@add_start_docstrings(
    "The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
    DISTILBERT_START_DOCSTRING,
    DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertModel

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
        # Embeddings + encoder; this model adds nothing on top of them.
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")

    def call(self, inputs, **kwargs):
        """Delegate to the main layer and return its output tuple unchanged."""
        return self.distilbert(inputs, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class TFDistilBertLMHead(tf.keras.layers.Layer):
    """Output projection that reuses the input embedding layer and adds a per-token bias."""

    def __init__(self, config, input_embeddings, **kwargs):
        super(TFDistilBertLMHead, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        """Create the output-only bias, one zero-initialized value per vocabulary entry."""
        self.bias = self.add_weight(
            shape=(self.vocab_size,),
            initializer='zeros',
            trainable=True,
            name='bias',
        )
        super(TFDistilBertLMHead, self).build(input_shape)

    def call(self, hidden_states):
        """Call the shared embedding layer in ``"linear"`` mode and add the bias."""
        projected = self.input_embeddings(hidden_states, mode="linear")
        return projected + self.bias
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.vocab_size = config.vocab_size

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.vocab_transform = tf.keras.layers.Dense(config.dim,
                                                     kernel_initializer=get_initializer(config.initializer_range),
                                                     name="vocab_transform")
        self.act = tf.keras.layers.Activation(gelu)
        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
        # The projector is given the main layer's embedding layer.
        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")

    def call(self, inputs, **kwargs):
        """Return vocabulary logits followed by the optional encoder outputs.

        ``kwargs`` are forwarded untouched to the main layer, so only the
        keyword arguments it accepts (``attention_mask``, ``head_mask``,
        ``training``) are valid here; no loss is computed.
        """
        distilbert_output = self.distilbert(inputs, **kwargs)

        hidden_states = distilbert_output[0]                               # (bs, seq_length, dim)
        prediction_logits = self.vocab_transform(hidden_states)            # (bs, seq_length, dim)
        prediction_logits = self.act(prediction_logits)                    # (bs, seq_length, dim)
        prediction_logits = self.vocab_layer_norm(prediction_logits)       # (bs, seq_length, dim)
        prediction_logits = self.vocab_projector(prediction_logits)        # (bs, seq_length, vocab_size)

        outputs = (prediction_logits,) + distilbert_output[1:]
        return outputs  # logits, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.pre_classifier = tf.keras.layers.Dense(config.dim,
                                                    kernel_initializer=get_initializer(config.initializer_range),
                                                    activation='relu',
                                                    name="pre_classifier")
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name="classifier")
        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)

    def call(self, inputs, **kwargs):
        """Classify each sequence from the hidden state of its first token.

        ``kwargs`` are forwarded to the main layer; ``training`` (default
        ``False``) additionally controls the dropout applied before the
        classifier.
        """
        distilbert_output = self.distilbert(inputs, **kwargs)

        hidden_state = distilbert_output[0]                    # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]                     # (bs, dim) -- first position only
        pooled_output = self.pre_classifier(pooled_output)     # (bs, dim)
        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))  # (bs, dim)
        logits = self.classifier(pooled_output)                # (bs, num_labels)

        outputs = (logits,) + distilbert_output[1:]
        return outputs  # logits, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
            Span-start scores (before SoftMax).
        **end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
            Span-end scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='qa_outputs')
        # `call` splits the projection into exactly two parts (start / end).
        assert config.num_labels == 2
        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)

    def call(self, inputs, **kwargs):
        """Return per-token span-start and span-end logits plus the optional encoder outputs.

        ``kwargs`` are forwarded to the main layer, so only the keyword
        arguments it accepts (``attention_mask``, ``head_mask``, ``training``)
        are valid; no loss is computed. ``training`` (default ``False``) also
        controls the dropout applied to the hidden states.
        """
        distilbert_output = self.distilbert(inputs, **kwargs)

        hidden_states = distilbert_output[0]                                                 # (bs, max_query_len, dim)
        hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False))  # (bs, max_query_len, dim)
        logits = self.qa_outputs(hidden_states)                                              # (bs, max_query_len, 2)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)                                     # (bs, max_query_len)
        end_logits = tf.squeeze(end_logits, axis=-1)                                         # (bs, max_query_len)

        outputs = (start_logits, end_logits,) + distilbert_output[1:]
        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
|
||||||
613
transformers/modeling_tf_gpt2.py
Normal file
613
transformers/modeling_tf_gpt2.py
Normal file
@@ -0,0 +1,613 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 OpenAI GPT-2 model. """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import collections
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
|
||||||
|
TFSequenceSummary, shape_list, get_initializer)
|
||||||
|
from .configuration_gpt2 import GPT2Config
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
||||||
|
|
||||||
|
# Module-level logger.
logger = logging.getLogger(__name__)

# Shortcut model name -> URL of the corresponding TF 2.0 weights file (.h5).
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """Load a PyTorch GPT-2 checkpoint into ``tf_model``.

    The model is first called once on a small dummy batch so that its
    variables exist, then the actual loading is delegated to
    ``load_pytorch_checkpoint_in_tf2_model``, whose result is returned.
    """
    # Dummy token ids, only used to build the network.
    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
    tf_inputs = tf.constant(inputs_list)
    # Called for its side effect (variable creation); the output is not needed.
    tf_model(tf_inputs, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
|
||||||
|
|
||||||
|
|
||||||
|
def gelu(x):
    """Gaussian Error Linear Unit (tanh approximation).

    A smoother alternative to the RELU activation.
    Original paper: https://arxiv.org/abs/1606.08415

    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    scaled_cubic = np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))
    gaussian_cdf = 0.5 * (1.0 + tf.tanh(scaled_cubic))
    return x * gaussian_cdf
|
||||||
|
|
||||||
|
|
||||||
|
class TFAttention(tf.keras.layers.Layer):
    """Causal multi-head self-attention with an optional key/value cache (``layer_past``)."""

    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        """
        Args:
            nx: feature size of the input (the block passes ``config.n_embd``).
            n_ctx: context size, stored as ``self.n_ctx``.
            config: provides ``n_head``, ``output_attentions``, ``initializer_range``,
                ``attn_pdrop`` and ``resid_pdrop``.
            scale: when true, attention scores are divided by sqrt(head feature size).
        """
        super(TFAttention, self).__init__(**kwargs)
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale

        # c_attn produces query, key and value in one projection (3 * n_state features).
        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """No-op: head pruning is not implemented for this layer."""
        pass

    @staticmethod
    def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.

        Returns a ``(nd, ns)`` tensor of ``dtype``.
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype)

    def _attn(self, inputs, training=False):
        """Compute masked attention.

        ``inputs`` is ``[q, k, v, attention_mask, head_mask]``; the two masks
        may be ``None``. Returns ``[context]`` or ``[context, weights]`` when
        ``self.output_attentions`` is set.
        """
        q, k, v, attention_mask, head_mask = inputs
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        if self.scale:
            # Scale attention scores. Cast to the dtype of the scores, like the
            # causal mask below, rather than to a fixed float32.
            dk = tf.cast(tf.shape(k)[-1], w.dtype)
            w = w / tf.math.sqrt(dk)

        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(w)
        b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        # Keep allowed positions, push the others to a large negative score.
        w = w * b - 1e4 * (1 - b)

        if attention_mask is not None:
            # Apply the attention mask (additive)
            w = w + attention_mask

        w = tf.nn.softmax(w, axis=-1)
        w = self.attn_dropout(w, training=training)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [tf.matmul(w, v)]
        if self.output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        """Inverse of ``split_heads``: (batch, head, seq, head_features) -> (batch, seq, head * head_features)."""
        x = tf.transpose(x, [0, 2, 1, 3])
        x_shape = shape_list(x)
        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
        return tf.reshape(x, new_x_shape)

    def split_heads(self, x):
        """Split the last axis into ``n_head`` heads and move the head axis forward."""
        x_shape = shape_list(x)
        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
        x = tf.reshape(x, new_x_shape)
        return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)

    def call(self, inputs, training=False):
        """Run attention on ``inputs = [x, layer_past, attention_mask, head_mask]``.

        When ``layer_past`` is given, its keys and values are prepended to the
        ones computed from ``x`` along the sequence axis.

        Returns ``[a, present, (attentions)]`` where ``present`` stacks the
        (possibly extended) keys and values along axis 1.
        """
        x, layer_past, attention_mask, head_mask = inputs

        x = self.c_attn(x)
        query, key, value = tf.split(x, 3, axis=2)
        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)
        if layer_past is not None:
            past_key, past_value = tf.unstack(layer_past, axis=1)
            key = tf.concat([past_key, key], axis=-2)
            value = tf.concat([past_value, value], axis=-2)
        present = tf.stack([key, value], axis=1)

        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a, training=training)

        outputs = [a, present] + attn_outputs[1:]
        return outputs  # a, present, (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFMLP(tf.keras.layers.Layer):
    """Feed-forward part of a block: ``c_fc`` -> GELU -> ``c_proj`` -> dropout."""

    def __init__(self, n_state, config, **kwargs):
        super(TFMLP, self).__init__(**kwargs)
        embed_dim = config.n_embd
        init_range = config.initializer_range
        self.c_fc = TFConv1D(n_state, embed_dim, initializer_range=init_range, name='c_fc')
        self.c_proj = TFConv1D(embed_dim, n_state, initializer_range=init_range, name='c_proj')
        self.act = gelu
        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)

    def call(self, x, training=False):
        """Apply the two projections with the activation in between, then dropout."""
        expanded = self.c_fc(x)
        activated = self.act(expanded)
        projected = self.c_proj(activated)
        return self.dropout(projected, training=training)
|
||||||
|
|
||||||
|
|
||||||
|
class TFBlock(tf.keras.layers.Layer):
    """One GPT-2 block: layer norm + attention and layer norm + MLP, each added back to its input."""

    def __init__(self, n_ctx, config, scale=False, **kwargs):
        super(TFBlock, self).__init__(**kwargs)
        embed_dim = config.n_embd
        norm_eps = config.layer_norm_epsilon
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=norm_eps, name='ln_1')
        self.attn = TFAttention(embed_dim, n_ctx, config, scale, name='attn')
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=norm_eps, name='ln_2')
        self.mlp = TFMLP(4 * embed_dim, config, name='mlp')

    def call(self, inputs, training=False):
        """Process ``inputs = [x, layer_past, attention_mask, head_mask]``.

        Returns ``[x, present, (attentions)]``: the updated hidden states
        followed by everything the attention layer returned after its first
        output.
        """
        hidden, layer_past, attention_mask, head_mask = inputs

        # Attention sub-layer (normalization is applied before attention).
        attn_results = self.attn(
            [self.ln_1(hidden), layer_past, attention_mask, head_mask],
            training=training,
        )  # a, present, (attentions)
        hidden = hidden + attn_results[0]

        # MLP sub-layer, same pre-normalization + residual pattern.
        hidden = hidden + self.mlp(self.ln_2(hidden), training=training)

        return [hidden] + attn_results[1:]  # x, present, (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFGPT2MainLayer(tf.keras.layers.Layer):
    """GPT-2 transformer body: token + position embeddings, a stack of ``TFBlock`` and a final layer norm.

    Args:
        config: model configuration. The attributes read here are
            ``output_hidden_states``, ``output_attentions``, ``n_layer``,
            ``vocab_size``, ``n_embd``, ``hidden_size``, ``initializer_range``,
            ``n_positions``, ``embd_pdrop``, ``n_ctx`` and ``layer_norm_epsilon``.
        *inputs, **kwargs: forwarded to ``tf.keras.layers.Layer.__init__``.
    """

    def __init__(self, config, *inputs, **kwargs):
        # `config` is deliberately NOT forwarded to the Keras base class: the first
        # positional parameter of `tf.keras.layers.Layer.__init__` is `trainable`,
        # so passing the config there would store it as the layer's trainable flag.
        super(TFGPT2MainLayer, self).__init__(*inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
        self.vocab_size = config.vocab_size
        self.n_embd = config.n_embd

        self.wte = TFSharedEmbeddings(config.vocab_size,
                                      config.hidden_size,
                                      initializer_range=config.initializer_range,
                                      name='wte')
        self.wpe = tf.keras.layers.Embedding(config.n_positions,
                                             config.n_embd,
                                             embeddings_initializer=get_initializer(config.initializer_range),
                                             name='wpe')
        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
        self.h = [TFBlock(config.n_ctx,
                          config,
                          scale=True,
                          name='h_._{}'.format(i)) for i in range(config.n_layer)]
        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')

    def _resize_token_embeddings(self, new_num_tokens):
        """Not implemented for this layer."""
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}

            Not implemented for this layer.
        """
        raise NotImplementedError

    def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
        """Run the transformer body.

        ``inputs`` may be a single tensor of input ids, a list/tuple ordered as
        ``[input_ids, past, attention_mask, token_type_ids, position_ids, head_mask]``,
        or a dict keyed by those names. Values found in ``inputs`` take precedence
        over the keyword arguments.

        Returns:
            A tuple ``(last hidden state, presents, (all hidden_states), (attentions))``;
            the last two entries are present only when the matching config flag is set.

        Raises:
            NotImplementedError: if a ``head_mask`` is supplied.
            AssertionError: if ``inputs`` holds more than six entries.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            past = inputs[1] if len(inputs) > 1 else past
            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
            position_ids = inputs[4] if len(inputs) > 4 else position_ids
            head_mask = inputs[5] if len(inputs) > 5 else head_mask
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            past = inputs.get('past', past)
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

        if past is None:
            past_length = 0
            past = [None] * len(self.h)
        else:
            past_length = shape_list(past[0][0])[-2]
        if position_ids is None:
            # Default positions continue right after the cached prefix.
            position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]

        if attention_mask is not None:
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = tf.cast(attention_mask, tf.float32)
            attention_mask = (1.0 - attention_mask) * -10000.0

        # Head masking is not supported yet: every block receives `None`.
        if head_mask is not None:
            raise NotImplementedError
        head_mask = [None] * self.num_hidden_layers

        # Collapse any leading dimensions so the blocks always see 2D id tensors.
        input_shape = shape_list(input_ids)
        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])

        inputs_embeds = self.wte(input_ids, mode='embedding')
        position_embeds = self.wpe(position_ids)
        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            # Token types are embedded with the same table as the input tokens.
            token_type_embeds = self.wte(token_type_ids, mode='embedding')
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states, training=training)

        # Shape used to restore the caller's leading dimensions on the outputs.
        output_shape = input_shape + [shape_list(hidden_states)[-1]]

        presents = ()
        all_attentions = []
        all_hidden_states = ()
        for i, (block, layer_past) in enumerate(zip(self.h, past)):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)

            outputs = block([hidden_states, layer_past, attention_mask, head_mask[i]], training=training)

            hidden_states, present = outputs[:2]
            presents = presents + (present,)

            if self.output_attentions:
                all_attentions.append(outputs[2])

        hidden_states = self.ln_f(hidden_states)

        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add last hidden state
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states, presents)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions,)
        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFGPT2PreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Configuration class associated with the GPT-2 models.
    config_class = GPT2Config
    # Archive map for the pretrained TF GPT-2 weights.
    pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
    # Function used to load PyTorch weights into the TF 2.0 model.
    load_pt_weights = load_gpt2_pt_weights_in_tf2
    # The concrete models in this file store their main layer as `self.transformer`.
    # NOTE(review): presumably the base class uses this prefix to locate it -- confirm.
    base_model_prefix = "transformer"
|
||||||
|
|
||||||
|
|
||||||
|
GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in
|
||||||
|
`Language Models are Unsupervised Multitask Learners`_
|
||||||
|
by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||||
|
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
|
||||||
|
corpus of ~40 GB of text data.
|
||||||
|
|
||||||
|
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
|
||||||
|
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`Language Models are Unsupervised Multitask Learners`:
|
||||||
|
https://openai.com/blog/better-language-models/
|
||||||
|
|
||||||
|
.. _`tf.keras.Model`:
|
||||||
|
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
|
||||||
|
|
||||||
|
Note on the model inputs:
|
||||||
|
TF 2.0 models accept two formats as inputs:
|
||||||
|
|
||||||
|
- having all inputs as keyword arguments (like PyTorch models), or
|
||||||
|
- having all inputs as a list, tuple or dict in the first positional arguments.
|
||||||
|
|
||||||
|
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
|
||||||
|
|
||||||
|
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
|
||||||
|
|
||||||
|
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
|
||||||
|
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
|
||||||
|
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
|
||||||
|
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
|
||||||
|
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
|
||||||
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
||||||
|
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
|
the right rather than the left.
|
||||||
|
Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**past**:
|
||||||
|
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
|
||||||
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
|
(see `past` output below). Can be used to speed up sequential decoding.
|
||||||
|
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
|
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
|
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class TFGPT2Model(TFGPT2PreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the last layer of the model.
        **past**:
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            that contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2Model

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2Model.from_pretrained('gpt2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name='transformer')

    def call(self, inputs, **kwargs):
        # The bare model adds nothing on top: hand back the main layer's outputs untouched.
        return self.transformer(inputs, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **past**:
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            that contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2LMHeadModel.from_pretrained('gpt2')

        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name='transformer')

    def call(self, inputs, **kwargs):
        body_outputs = self.transformer(inputs, **kwargs)
        last_hidden = body_outputs[0]

        # The LM head reuses the shared token embedding layer in "linear" mode.
        logits = self.transformer.wte(last_hidden, mode="linear")

        # lm_logits, presents, (all hidden_states), (attentions)
        return (logits,) + body_outputs[1:]
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the input of a specified classification token index in the input sequence).
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
    r"""
        **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1[``.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **lm_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **mc_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices)``
            Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
        **past**:
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            that contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')

        # Add a [CLS] to the vocabulary (we should train it also!)
        # This option is currently not implemented in TF 2.0
        raise NotImplementedError
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
        print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary

        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        encoded_choices = [tokenizer.encode(s) for s in choices]
        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

        input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
        mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1

        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')

    def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
        # Resolve the three accepted input formats; entries of `inputs` win over keywords.
        if isinstance(inputs, (tuple, list)):
            given = len(inputs)
            input_ids = inputs[0]
            if given > 1:
                past = inputs[1]
            if given > 2:
                attention_mask = inputs[2]
            if given > 3:
                token_type_ids = inputs[3]
            if given > 4:
                position_ids = inputs[4]
            if given > 5:
                head_mask = inputs[5]
            if given > 6:
                mc_token_ids = inputs[6]
            assert given <= 7, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            past = inputs.get('past', past)
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
            assert len(inputs) <= 7, "Too many inputs."
        else:
            input_ids = inputs

        input_shapes = shape_list(input_ids)
        seq_length = input_shapes[-1]

        def _flatten(tensor):
            # Merge all leading dimensions into one; absent tensors stay None.
            if tensor is None:
                return None
            return tf.reshape(tensor, (-1, seq_length))

        flat_inputs = [
            tf.reshape(input_ids, (-1, seq_length)),
            past,
            _flatten(attention_mask),
            _flatten(token_type_ids),
            _flatten(position_ids),
            head_mask,
        ]

        transformer_outputs = self.transformer(flat_inputs, training=training)
        flat_hidden = transformer_outputs[0]

        # Restore the original leading dimensions on the hidden states.
        hidden_states = tf.reshape(flat_hidden, input_shapes + shape_list(flat_hidden)[-1:])

        lm_logits = self.transformer.wte(hidden_states, mode="linear")
        mc_logits = tf.squeeze(
            self.multiple_choice_head([hidden_states, mc_token_ids], training=training),
            axis=-1)

        # lm logits, mc logits, presents, (all hidden_states), (attentions)
        return (lm_logits, mc_logits) + transformer_outputs[1:]
|
||||||
576
transformers/modeling_tf_openai.py
Normal file
576
transformers/modeling_tf_openai.py
Normal file
@@ -0,0 +1,576 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 OpenAI GPT model."""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import collections
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
|
||||||
|
TFSequenceSummary, shape_list, get_initializer)
|
||||||
|
from .configuration_openai import OpenAIGPTConfig
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"}
|
||||||
|
|
||||||
|
|
||||||
|
def load_openai_gpt_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """Load a PyTorch checkpoint into ``tf_model``.

    The model is first called once on a small constant batch so that the
    network is built; the same batch is then passed on to
    ``load_pytorch_checkpoint_in_tf2_model``, whose result is returned.
    """
    dummy_batch = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    # build the network (the outputs themselves are not needed)
    tf_model(dummy_batch, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=dummy_batch)
|
||||||
|
|
||||||
|
|
||||||
|
def gelu(x):
    """Gaussian Error Linear Unit (tanh approximation).

    A smoother alternative to the RELU.
    Original paper: https://arxiv.org/abs/1606.08415

    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    inner = np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))
    cdf = 0.5 * (1.0 + tf.tanh(inner))
    return x * cdf
|
||||||
|
|
||||||
|
|
||||||
|
def swish(x):
    """Swish activation: the input multiplied by its own sigmoid."""
    gate = tf.math.sigmoid(x)
    return x * gate
|
||||||
|
|
||||||
|
|
||||||
|
# Lookup table from activation name to a callable.
# "gelu" and "swish" are wrapped in Keras `Activation` layers, while "relu" is
# the plain `tf.keras.activations.relu` function.
# NOTE(review): `TFMLP` in this file assigns `gelu` directly instead of reading
# this table -- confirm whether a config-selected activation is intended.
ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu),
           "relu": tf.keras.activations.relu,
           "swish": tf.keras.layers.Activation(swish)}
|
||||||
|
|
||||||
|
|
||||||
|
class TFAttention(tf.keras.layers.Layer):
    """Causal multi-head self-attention layer.

    Args:
        nx: feature width; must be divisible by ``config.n_head``.
        n_ctx: stored on the layer as ``self.n_ctx``.
        config: model configuration; ``output_attentions``, ``n_head``,
            ``initializer_range``, ``attn_pdrop`` and ``resid_pdrop`` are read here.
        scale: when true, attention scores are divided by the square root of
            the last dimension of the keys.
    """

    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        super(TFAttention, self).__init__(**kwargs)
        self.output_attentions = config.output_attentions

        # The attention width is the incoming feature width (n_state == nx == n_embd).
        assert nx % config.n_head == 0
        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = nx
        self.scale = scale

        init_range = config.initializer_range
        self.c_attn = TFConv1D(nx * 3, nx, initializer_range=init_range, name='c_attn')
        self.c_proj = TFConv1D(nx, nx, initializer_range=init_range, name='c_proj')
        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """No-op placeholder: head pruning does nothing here."""
        pass

    @staticmethod
    def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        rows = tf.range(nd)[:, None]
        cols = tf.range(ns)
        return tf.cast(rows >= cols - ns + nd, dtype)

    def _attn(self, inputs, training=False):
        """Compute attention for ``[q, k, v, attention_mask, head_mask]``.

        Returns:
            ``[context]``, or ``[context, probabilities]`` when
            ``self.output_attentions`` is set.
        """
        q, k, v, attention_mask, head_mask = inputs
        # q, k, v have shape [batch, heads, sequence, features]
        scores = tf.matmul(q, k, transpose_b=True)
        if self.scale:
            # scale attention_scores
            scores = scores / tf.math.sqrt(tf.cast(tf.shape(k)[-1], tf.float32))

        # scores has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(scores)
        causal = tf.reshape(self.causal_attention_mask(nd, ns, dtype=scores.dtype), [1, 1, nd, ns])
        # Keep allowed positions and push disallowed ones to a large negative value.
        scores = scores * causal - 1e4 * (1 - causal)

        if attention_mask is not None:
            # Apply the attention mask
            scores = scores + attention_mask

        probs = self.attn_dropout(tf.nn.softmax(scores, axis=-1), training=training)

        # Mask heads if we want to
        if head_mask is not None:
            probs = probs * head_mask

        context = tf.matmul(probs, v)
        if self.output_attentions:
            return [context, probs]
        return [context]

    def merge_heads(self, x):
        """Swap axes 1 and 2, then fold the last two axes into one."""
        swapped = tf.transpose(x, [0, 2, 1, 3])
        dims = shape_list(swapped)
        return tf.reshape(swapped, dims[:-2] + [dims[-2] * dims[-1]])

    def split_heads(self, x):
        """Split the last axis into ``n_head`` heads and move the head axis to position 1."""
        dims = shape_list(x)
        per_head = tf.reshape(x, dims[:-1] + [self.n_head, dims[-1] // self.n_head])
        return tf.transpose(per_head, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)

    def call(self, inputs, training=False):
        """Run attention on ``[x, attention_mask, head_mask]``; returns ``[a, (attentions)]``."""
        x, attention_mask, head_mask = inputs

        # One fused projection, then cut into query / key / value along the feature axis.
        fused = self.c_attn(x)
        query, key, value = tf.split(fused, 3, axis=2)
        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)

        attn_results = self._attn([query, key, value, attention_mask, head_mask], training=training)

        merged = self.merge_heads(attn_results[0])
        projected = self.resid_dropout(self.c_proj(merged), training=training)

        return [projected] + attn_results[1:]  # a, (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFMLP(tf.keras.layers.Layer):
    """Position-wise feed-forward sub-layer: ``c_fc`` -> gelu -> ``c_proj`` -> dropout."""

    def __init__(self, n_state, config, **kwargs):
        """Build the two ``TFConv1D`` projections and the output dropout.

        Args:
            n_state: size passed to ``TFConv1D`` as the inner width.
            config: model configuration; ``n_embd``, ``initializer_range`` and
                ``resid_pdrop`` are read from it.
            **kwargs: forwarded to ``tf.keras.layers.Layer``.
        """
        super(TFMLP, self).__init__(**kwargs)
        embed_dim = config.n_embd
        init_range = config.initializer_range
        self.c_fc = TFConv1D(n_state, embed_dim, initializer_range=init_range, name='c_fc')
        self.c_proj = TFConv1D(embed_dim, n_state, initializer_range=init_range, name='c_proj')
        self.act = gelu
        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)

    def call(self, x, training=False):
        """Apply the feed-forward transformation to ``x``; dropout is gated by ``training``."""
        projected = self.c_proj(self.act(self.c_fc(x)))
        return self.dropout(projected, training=training)
|
||||||
|
|
||||||
|
|
||||||
|
class TFBlock(tf.keras.layers.Layer):
    """One transformer block: attention, residual + LayerNorm, MLP, residual + LayerNorm."""

    def __init__(self, n_ctx, config, scale=False, **kwargs):
        """Create the attention, MLP and the two layer-normalisation sub-layers.

        Args:
            n_ctx: forwarded to ``TFAttention``.
            config: model configuration; ``n_embd`` and ``layer_norm_epsilon``
                are read here, the rest is consumed by the sub-layers.
            scale: forwarded to ``TFAttention``.
            **kwargs: forwarded to ``tf.keras.layers.Layer``.
        """
        super(TFBlock, self).__init__(**kwargs)
        embed_dim = config.n_embd
        self.attn = TFAttention(embed_dim, n_ctx, config, scale, name='attn')
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
        self.mlp = TFMLP(4 * embed_dim, config, name='mlp')
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')

    def call(self, inputs, training=False):
        """Process ``(x, attention_mask, head_mask)`` and return ``[hidden] + extra attention outputs``."""
        x, attention_mask, head_mask = inputs

        attn_result = self.attn([x, attention_mask, head_mask], training=training)

        # Normalisation is applied AFTER each residual sum.
        normed_attn = self.ln_1(x + attn_result[0])
        feed_forward = self.mlp(normed_attn, training=training)
        block_output = self.ln_2(normed_attn + feed_forward)

        return [block_output] + attn_result[1:]  # x, (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
    """Core OpenAI GPT network: token and position embeddings followed by a stack of ``TFBlock`` layers."""

    def __init__(self, config, *inputs, **kwargs):
        """Build the embeddings, the embedding dropout and ``config.n_layer`` blocks.

        Args:
            config: model configuration object (see the attributes read below).
            *inputs, **kwargs: forwarded to ``tf.keras.layers.Layer``.
        """
        # `config` must NOT be forwarded to the Keras base class: the first
        # positional parameter of tf.keras.layers.Layer.__init__ is
        # `trainable`, so the config object would be stored as that flag.
        super(TFOpenAIGPTMainLayer, self).__init__(*inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
        self.vocab_size = config.vocab_size
        self.n_embd = config.n_embd

        self.tokens_embed = TFSharedEmbeddings(config.vocab_size,
                                               config.n_embd,
                                               initializer_range=config.initializer_range,
                                               name='tokens_embed')
        self.positions_embed = tf.keras.layers.Embedding(config.n_positions,
                                                         config.n_embd,
                                                         embeddings_initializer=get_initializer(config.initializer_range),
                                                         name='positions_embed')
        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
        self.h = [TFBlock(config.n_ctx,
                          config,
                          scale=True,
                          name='h_._{}'.format(i)) for i in range(config.n_layer)]

    def _resize_token_embeddings(self, new_num_tokens):
        """Not implemented for this layer."""
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}

            Not implemented for this layer.
        """
        raise NotImplementedError

    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
        """Run the transformer.

        Args:
            inputs: either the ``input_ids`` tensor alone, a list/tuple
                ``[input_ids, attention_mask, token_type_ids, position_ids, head_mask]``
                (trailing items optional), or a dict keyed by those names.
            attention_mask, token_type_ids, position_ids, head_mask: used when
                the corresponding item is not supplied through ``inputs``.
            training: forwarded to the dropout layers and to every block.

        Returns:
            Tuple ``(last_hidden_state, [all_hidden_states], [all_attentions])``;
            the optional items are present when ``output_hidden_states`` /
            ``output_attentions`` are set on the config.

        Raises:
            NotImplementedError: if a ``head_mask`` is supplied.
            AssertionError: if more than five inputs are packed in ``inputs``.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            assert len(inputs) <= 5, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 5, "Too many inputs."
        else:
            input_ids = inputs

        if position_ids is None:
            # Default positions are 0..seq_len-1, with a leading axis of size 1.
            position_ids = tf.range(shape_list(input_ids)[-1], dtype=tf.int32)[tf.newaxis, :]

        if attention_mask is not None:
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = tf.cast(attention_mask, tf.float32)
            attention_mask = (1.0 - attention_mask) * -10000.0

        # Head masking is not supported yet: every block receives `None`.
        if head_mask is not None:
            raise NotImplementedError
        head_mask = [None] * self.num_hidden_layers

        # Flatten every leading axis so the blocks see 2D id tensors.
        input_shape = shape_list(input_ids)
        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])

        inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
        position_embeds = self.positions_embed(position_ids)
        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            # Token types are looked up in the same table as the tokens.
            token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding')
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states, training=training)

        # Shape used to restore the original leading axes on the outputs.
        output_shape = input_shape + [shape_list(hidden_states)[-1]]

        all_attentions = []
        all_hidden_states = ()
        for i, block in enumerate(self.h):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)

            outputs = block([hidden_states, attention_mask, head_mask[i]], training=training)
            hidden_states = outputs[0]
            if self.output_attentions:
                all_attentions.append(outputs[1])

        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add last hidden state
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions,)
        return outputs  # last hidden state, (all hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
    """ Abstract base class for the TF 2.0 OpenAI GPT models.

        It only binds the class-level hooks declared below (configuration
        class, pretrained archive map, PyTorch weight loader and base-model
        prefix); the behaviour comes from ``TFPreTrainedModel``.
    """
    # Configuration class associated with this model family.
    config_class = OpenAIGPTConfig
    # Mapping of pretrained model archives for this model family.
    pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
    # Function used to load PyTorch weights into the TF 2.0 model.
    load_pt_weights = load_openai_gpt_pt_weights_in_tf2
    # Attribute name under which sub-classes store the main layer.
    base_model_prefix = "transformer"
|
||||||
|
|
||||||
|
|
||||||
|
OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in
|
||||||
|
`Improving Language Understanding by Generative Pre-Training`_
|
||||||
|
by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||||
|
It's a causal (unidirectional) transformer pre-trained using language modeling on a large
|
||||||
|
corpus with long range dependencies, the Toronto Book Corpus.
|
||||||
|
|
||||||
|
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
|
||||||
|
refer to the TF 2.0 documentation for all matters related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`Improving Language Understanding by Generative Pre-Training`:
|
||||||
|
https://openai.com/blog/language-unsupervised/
|
||||||
|
|
||||||
|
.. _`tf.keras.Model`:
|
||||||
|
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
|
||||||
|
|
||||||
|
Note on the model inputs:
|
||||||
|
TF 2.0 models accept two formats as inputs:
|
||||||
|
|
||||||
|
- having all inputs as keyword arguments (like PyTorch models), or
|
||||||
|
- having all inputs as a list, tuple or dict in the first positional arguments.
|
||||||
|
|
||||||
|
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
|
||||||
|
|
||||||
|
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
|
||||||
|
|
||||||
|
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
|
||||||
|
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
|
||||||
|
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
|
||||||
|
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
|
||||||
|
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
|
||||||
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
||||||
|
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
|
the right rather than the left.
|
||||||
|
Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
|
||||||
|
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
|
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            tuple of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            tuple of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
        # The attribute name matches `base_model_prefix` on the parent class.
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')

    def call(self, inputs, **kwargs):
        # Pure delegation: the main layer already returns the output tuple.
        return self.transformer(inputs, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            tuple of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            tuple of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')

    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)

        # The LM head reuses the token embedding layer in its "linear" mode.
        lm_logits = self.transformer.tokens_embed(transformer_outputs[0], mode="linear")

        return (lm_logits,) + transformer_outputs[1:]  # lm_logits, (all hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
    The language modeling head has its weights tied to the input embeddings,
    the classification head takes as input the input of a specified classification token index in the input sequence).
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    r"""
        **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.shape[-1] - 1]``.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **lm_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **mc_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices)``
            Prediction scores of the multiple-choice classification head (scores for each choice before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            tuple of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            tuple of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')

        # Add a [CLS] to the vocabulary (we should train it also!)
        # This option is currently not implemented in TF 2.0
        raise NotImplementedError
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
        print(tokenizer.cls_token_id, len(tokenizer))  # The new token is the last token of the vocabulary

        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
        mc_token_ids = tf.constant([input_ids.shape[-1] - 1, input_ids.shape[-1] - 1])[None, :]  # Batch size 1
        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')

    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
        # Accept the three calling conventions: bare tensor, list/tuple, dict.
        if isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, (tuple, list)):
            count = len(inputs)
            input_ids = inputs[0]
            if count > 1:
                attention_mask = inputs[1]
            if count > 2:
                token_type_ids = inputs[2]
            if count > 3:
                position_ids = inputs[3]
            if count > 4:
                head_mask = inputs[4]
            if count > 5:
                mc_token_ids = inputs[5]
            assert count <= 6, "Too many inputs."
        else:
            input_ids = inputs

        input_shapes = shape_list(input_ids)
        seq_length = input_shapes[-1]

        def _flatten(tensor):
            # Merge all leading axes into one; absent inputs stay None.
            if tensor is None:
                return None
            return tf.reshape(tensor, (-1, seq_length))

        flat_inputs = [_flatten(input_ids),
                       _flatten(attention_mask),
                       _flatten(token_type_ids),
                       _flatten(position_ids),
                       head_mask]

        transformer_outputs = self.transformer(flat_inputs, training=training)

        # Restore the leading axes of `input_ids` in front of the feature axis.
        hidden_states = transformer_outputs[0]
        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])

        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
        mc_logits = tf.squeeze(mc_logits, axis=-1)

        return (lm_logits, mc_logits) + transformer_outputs[1:]  # lm logits, mc logits, (all hidden_states), (attentions)
|
||||||
291
transformers/modeling_tf_pytorch_utils.py
Normal file
291
transformers/modeling_tf_pytorch_utils.py
Normal file
@@ -0,0 +1,291 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" PyTorch - TF 2.0 general utilities."""
|
||||||
|
|
||||||
|
from __future__ import (absolute_import, division, print_function,
|
||||||
|
unicode_literals)
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
|
||||||
|
|
||||||
|
def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''):
    """ Convert a TF 2.0 model variable name in a pytorch model weight name.

        Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
            - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
            - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)

        Args:
            tf_name: TF 2.0 variable name, e.g. ``'model/bert/layer_._0/kernel:0'``.
                The first '/'-separated level is always dropped.
            start_prefix_to_remove: if non-empty, it is stripped from the
                converted name only when the name actually starts with it.

        return tuple with:
            - pytorch model weight name
            - transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regards to each other
    """
    tf_name = tf_name.replace(':0', '')  # drop the ':0' suffix carried by TF variable names
    tf_name = re.sub(r'/[^/]*___([^/]*)/', r'/\1/', tf_name)  # '$1___$2' is replaced by $2
    tf_name = tf_name.replace('_._', '/')  # '_._' is replaced by a level separation
    tf_name = re.sub(r'//+', '/', tf_name)  # collapse empty levels
    levels = tf_name.split('/')[1:]  # split on TF2.0 '/' separators and remove level zero

    # When should we transpose the weights
    transpose = bool(levels[-1] == 'kernel' or 'emb_projs' in levels or 'out_projs' in levels)

    # Convert standard TF2.0 names in PyTorch names
    if levels[-1] in ('kernel', 'embeddings', 'gamma'):
        levels[-1] = 'weight'
    elif levels[-1] == 'beta':
        levels[-1] = 'bias'

    pt_name = '.'.join(levels)

    # Strip the prefix only at the START of the name. A plain
    # str.replace(prefix, '', 1) would also delete a match occurring deeper
    # in the name (e.g. 'head.bert.0.weight' with prefix 'bert.').
    if start_prefix_to_remove and pt_name.startswith(start_prefix_to_remove):
        pt_name = pt_name[len(start_prefix_to_remove):]

    return pt_name, transpose
|
||||||
|
|
||||||
|
|
||||||
|
#####################
|
||||||
|
### PyTorch => TF 2.0
|
||||||
|
|
||||||
|
def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
    """ Load a PyTorch checkpoint file into a TF 2.0 model.

        The checkpoint is read with ``torch.load`` on CPU and the resulting
        state dict is handed to ``load_pytorch_weights_in_tf2_model``, whose
        return value is returned unchanged.

        Raises:
            ImportError: if TensorFlow or PyTorch cannot be imported.
    """
    try:
        # Both frameworks are required; fail early with install instructions.
        import tensorflow as tf
        import torch
    except ImportError as import_error:
        logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
                     "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
        raise import_error

    checkpoint_file = os.path.abspath(pytorch_checkpoint_path)
    logger.info("Loading PyTorch weights from {}".format(checkpoint_file))

    # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
    state_dict = torch.load(checkpoint_file, map_location='cpu')

    return load_pytorch_weights_in_tf2_model(tf_model,
                                             state_dict,
                                             tf_inputs=tf_inputs,
                                             allow_missing_keys=allow_missing_keys)
|
||||||
|
|
||||||
|
|
||||||
|
def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
    """ Load the weights of an instantiated PyTorch model into a TF 2.0 model.

        Takes ``pt_model.state_dict()`` and delegates to
        ``load_pytorch_weights_in_tf2_model``.
    """
    return load_pytorch_weights_in_tf2_model(tf_model,
                                             pt_model.state_dict(),
                                             tf_inputs=tf_inputs,
                                             allow_missing_keys=allow_missing_keys)
|
||||||
|
|
||||||
|
|
||||||
|
def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False):
    """ Load pytorch state_dict in a TF 2.0 model.

        Args:
            tf_model: TF 2.0 model to fill; it is called once on ``tf_inputs``
                so that its variables exist, and once more after loading.
            pt_state_dict: mapping of PyTorch weight names to tensors.
                NOTE: it is modified in place (keys containing 'gamma'/'beta'
                are renamed to 'weight'/'bias').
            tf_inputs: inputs used to build the model; defaults to
                ``tf.constant(DUMMY_INPUTS)``.
            allow_missing_keys: when True, TF weights without a counterpart in
                ``pt_state_dict`` are skipped (and logged) instead of raising.

        Returns:
            ``tf_model`` with the matching weights set.

        Raises:
            ImportError: if TensorFlow or PyTorch cannot be imported.
            AssertionError: if a TF weight has no PyTorch counterpart (and
                ``allow_missing_keys`` is False) or if shapes do not match.
    """
    try:
        import torch
        import tensorflow as tf
        from tensorflow.python.keras import backend as K
    except ImportError as e:
        logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
                     "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
        raise e

    if tf_inputs is None:
        tf_inputs = tf.constant(DUMMY_INPUTS)

    tf_model(tf_inputs, training=False)  # Make sure model is built

    # Adapt state dict - TODO remove this and update the AWS weights files instead
    # Convert old format to new format if needed from a PyTorch state_dict
    old_keys = []
    new_keys = []
    for key in pt_state_dict.keys():
        new_key = None
        if 'gamma' in key:
            new_key = key.replace('gamma', 'weight')
        if 'beta' in key:
            new_key = key.replace('beta', 'bias')
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    # Renaming is done after the scan so the dict is not mutated while iterated.
    for old_key, new_key in zip(old_keys, new_keys):
        pt_state_dict[new_key] = pt_state_dict.pop(old_key)

    # Make sure we are able to load PyTorch base models as well as derived models (with heads)
    # TF models always have a prefix, some of PyTorch models (base ones) don't
    start_prefix_to_remove = ''
    if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()):
        start_prefix_to_remove = tf_model.base_model_prefix + '.'

    symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights

    weight_value_tuples = []
    all_pytorch_weights = set(pt_state_dict.keys())
    skipped_tf_weights = []
    for symbolic_weight in symbolic_weights:
        sw_name = symbolic_weight.name
        name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)

        # Find associated numpy array in pytorch model state dict
        if name not in pt_state_dict:
            if allow_missing_keys:
                # Caller opted in: leave this TF weight at its current value.
                skipped_tf_weights.append(sw_name)
                continue
            raise AssertionError("{} not found in PyTorch model".format(name))
        array = pt_state_dict[name].numpy()

        if transpose:
            array = numpy.transpose(array)

        # Reconcile a rank difference of singleton axes between the frameworks.
        if len(symbolic_weight.shape) < len(array.shape):
            array = numpy.squeeze(array)
        elif len(symbolic_weight.shape) > len(array.shape):
            array = numpy.expand_dims(array, axis=0)

        if list(symbolic_weight.shape) != list(array.shape):
            raise AssertionError(symbolic_weight.shape, array.shape)

        logger.info("Initialize TF weight {}".format(symbolic_weight.name))

        weight_value_tuples.append((symbolic_weight, array))
        all_pytorch_weights.discard(name)

    K.batch_set_value(weight_value_tuples)

    tf_model(tf_inputs, training=False)  # Make sure restore ops are run

    if skipped_tf_weights:
        logger.info("TF weights not found in PyTorch model and left untouched: {}".format(skipped_tf_weights))
    logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))

    return tf_model
|
||||||
|
|
||||||
|
|
||||||
|
#####################
|
||||||
|
### TF 2.0 => PyTorch
|
||||||
|
|
||||||
|
def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
    """ Load TF 2.0 HDF5 checkpoint in a PyTorch model

        We use HDF5 to easily do transfer learning
        (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).

        The TF 2.0 counterpart of ``pt_model`` (the class named "TF" + the PyTorch class name, looked up
        on the ``transformers`` package) is instantiated from ``pt_model.config``, built by calling it
        once, filled from the checkpoint and finally copied into ``pt_model``.

        Args:
            pt_model: PyTorch model to fill; its class name and ``config`` select the TF 2.0 model.
            tf_checkpoint_path: path handed to ``tf_model.load_weights(..., by_name=True)``.
            tf_inputs: optional inputs used to build the TF 2.0 model; defaults to
                ``tf.constant(DUMMY_INPUTS)``.
            allow_missing_keys: forwarded unchanged to ``load_tf2_model_in_pytorch_model``.

        Returns:
            The value returned by ``load_tf2_model_in_pytorch_model`` for ``pt_model``.

        Raises:
            ImportError: if TensorFlow or PyTorch cannot be imported.
    """
    try:
        import tensorflow as tf
        import torch  # noqa: F401 -- imported only to fail early when PyTorch is missing
    except ImportError:
        logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
        raise

    # Imported here rather than at module level: the TF model classes are looked up on the package.
    import transformers

    logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))

    # Instantiate and load the associated TF 2.0 model
    tf_model_class_name = "TF" + pt_model.__class__.__name__  # Add "TF" at the beginning
    tf_model_class = getattr(transformers, tf_model_class_name)
    tf_model = tf_model_class(pt_model.config)

    if tf_inputs is None:
        tf_inputs = tf.constant(DUMMY_INPUTS)

    # Run the model once so that its variables exist before the checkpoint is read.
    tf_model(tf_inputs, training=False)

    tf_model.load_weights(tf_checkpoint_path, by_name=True)

    return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys)
|
||||||
|
|
||||||
|
def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False):
    """ Load TF 2.0 model in a pytorch model

        Thin wrapper: reads ``tf_model.weights`` and delegates the actual copy to
        ``load_tf2_weights_in_pytorch_model``, forwarding ``allow_missing_keys`` unchanged.
    """
    return load_tf2_weights_in_pytorch_model(pt_model, tf_model.weights, allow_missing_keys=allow_missing_keys)
|
||||||
|
|
||||||
|
|
||||||
|
def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False):
    """ Load TF2.0 symbolic weights in a PyTorch model

        Each TF 2.0 variable name is converted to a candidate PyTorch parameter name. Every parameter
        of ``pt_model`` is then matched against that map, the array is transposed and squeezed/expanded
        as required, and the result is loaded with ``load_state_dict(strict=False)``.

        Args:
            pt_model: PyTorch model to fill; must expose ``base_model_prefix``,
                ``named_parameters()`` and ``load_state_dict()``.
            tf_weights: iterable of TF 2.0 variables exposing ``.name`` and ``.numpy()``.
            allow_missing_keys: if True, a PyTorch parameter without a counterpart in ``tf_weights``
                is skipped (and later reported through the missing keys log) instead of raising.

        Returns:
            ``pt_model`` after ``load_state_dict`` has been applied to it.

        Raises:
            ImportError: if TensorFlow or PyTorch cannot be imported.
            ValueError: if a parameter has no TF 2.0 counterpart and ``allow_missing_keys`` is False.
            AssertionError: if the shapes still differ after transposition and rank adjustment; the
                exception arguments are the PyTorch shape and the array shape.
    """
    try:
        import tensorflow as tf  # noqa: F401 -- imported only to fail early when TensorFlow is missing
        import torch
    except ImportError:
        logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
        raise

    new_pt_params_dict = {}
    current_pt_params_dict = dict(pt_model.named_parameters())

    # Make sure we are able to load PyTorch base models as well as derived models (with heads)
    # TF models always have a prefix, some of PyTorch models (base ones) don't
    start_prefix_to_remove = ''
    if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict):
        start_prefix_to_remove = pt_model.base_model_prefix + '.'

    # Build a map from potential PyTorch weight names to TF 2.0 Variables
    tf_weights_map = {}
    for tf_weight in tf_weights:
        pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(
            tf_weight.name, start_prefix_to_remove=start_prefix_to_remove)
        tf_weights_map[pt_name] = (tf_weight.numpy(), transpose)

    # Names are discarded as they get used; what remains is reported at the end.
    all_tf_weights = set(tf_weights_map)
    loaded_pt_weights_data_ptr = {}
    for pt_weight_name, pt_weight in current_pt_params_dict.items():
        data_ptr = pt_weight.data_ptr()

        # Handle PyTorch shared weights (not duplicated in TF 2.0): reuse the tensor already loaded
        if data_ptr in loaded_pt_weights_data_ptr:
            new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[data_ptr]
            continue

        # Find associated numpy array in the TF 2.0 weights map
        if pt_weight_name not in tf_weights_map:
            if allow_missing_keys:
                # Leave the parameter untouched; load_state_dict(strict=False) reports it as missing.
                continue
            raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name))

        array, transpose = tf_weights_map[pt_weight_name]

        if transpose:
            array = numpy.transpose(array)

        # Align ranks before comparing shapes
        if len(pt_weight.shape) < len(array.shape):
            array = numpy.squeeze(array)
        elif len(pt_weight.shape) > len(array.shape):
            array = numpy.expand_dims(array, axis=0)

        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        if list(pt_weight.shape) != list(array.shape):
            raise AssertionError(pt_weight.shape, array.shape)

        logger.info("Initialize PyTorch weight {}".format(pt_weight_name))

        tensor = torch.from_numpy(array)
        new_pt_params_dict[pt_weight_name] = tensor
        loaded_pt_weights_data_ptr[data_ptr] = tensor
        all_tf_weights.discard(pt_weight_name)

    missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)

    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
            pt_model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from TF 2.0 model not used in {}: {}".format(
            pt_model.__class__.__name__, unexpected_keys))

    logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights))

    return pt_model
|
||||||
382
transformers/modeling_tf_roberta.py
Normal file
382
transformers/modeling_tf_roberta.py
Normal file
@@ -0,0 +1,382 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 RoBERTa model. """
|
||||||
|
|
||||||
|
from __future__ import (absolute_import, division, print_function,
|
||||||
|
unicode_literals)
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .configuration_roberta import RobertaConfig
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
||||||
|
|
||||||
|
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||||
|
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
|
||||||
|
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
|
||||||
|
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
|
||||||
|
}
|
||||||
|
|
||||||
|
def load_roberta_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """ Build ``tf_model`` on a small dummy batch, then load a PyTorch checkpoint into it.

        Returns whatever ``load_pytorch_checkpoint_in_tf2_model`` returns.
    """
    # A 3 x 5 batch of token ids, only used to run the network once so that it is built
    dummy_batch = tf.constant([[7, 6, 0, 0, 1],
                               [1, 2, 3, 0, 0],
                               [0, 0, 0, 4, 5]])
    tf_model(dummy_batch, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=dummy_batch)
|
||||||
|
|
||||||
|
|
||||||
|
class TFRobertaEmbeddings(TFBertEmbeddings):
    """
    TFBertEmbeddings variant whose default position ids start at ``padding_idx + 1``.
    """
    def __init__(self, config, **kwargs):
        super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
        # Default position ids are offset so that they start right after this index.
        self.padding_idx = 1

    def _embedding(self, inputs, training=False):
        """Applies embedding based on inputs tensor.

        ``inputs`` is unpacked as ``(input_ids, position_ids, token_type_ids)``. When
        ``position_ids`` is None, a single row ``[padding_idx + 1, ..., padding_idx + seq_length]``
        is generated before delegating to the parent implementation.
        """
        input_ids, position_ids, token_type_ids = inputs

        seq_length = tf.shape(input_ids)[1]
        if position_ids is None:
            first_position = self.padding_idx + 1
            last_position = seq_length + self.padding_idx + 1
            default_positions = tf.range(first_position, last_position, dtype=tf.int32)
            position_ids = default_positions[tf.newaxis, :]

        embedding_inputs = [input_ids, position_ids, token_type_ids]
        return super(TFRobertaEmbeddings, self)._embedding(embedding_inputs, training=training)
|
||||||
|
|
||||||
|
|
||||||
|
class TFRobertaMainLayer(TFBertMainLayer):
    """
    TFBertMainLayer whose ``embeddings`` sub-layer is replaced by TFRobertaEmbeddings.
    """
    def __init__(self, config, **kwargs):
        super(TFRobertaMainLayer, self).__init__(config, **kwargs)
        # Override the embeddings created by the parent constructor.
        self.embeddings = TFRobertaEmbeddings(config, name='embeddings')

    def call(self, inputs, **kwargs):
        """Warn when the first token ids of the batch do not sum to zero, then run the parent layer."""
        # Locate input_ids whatever the input format is (dict, list/tuple or bare tensor)
        if isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
        elif isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
        else:
            input_ids = inputs

        # Check that input_ids starts with control token
        first_token_sum = tf.reduce_sum(input_ids[:, 0])
        if tf.not_equal(first_token_sum, 0):
            logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
                           "This model requires special tokens in order to work. "
                           "Please specify add_special_tokens=True in your encoding.")

        return super(TFRobertaMainLayer, self).call(inputs, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class TFRobertaPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Attribute name / weight-name prefix used for the main layer
    base_model_prefix = "roberta"
    # Configuration class associated with the RoBERTa models
    config_class = RobertaConfig
    # Shortcut name -> URL of the TF 2.0 HDF5 weights
    pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    # Function used to load a PyTorch checkpoint into a TF 2.0 model
    load_pt_weights = load_roberta_pt_weights_in_tf2
|
||||||
|
|
||||||
|
|
||||||
|
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
||||||
|
`RoBERTa: A Robustly Optimized BERT Pretraining Approach`_
|
||||||
|
by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
|
||||||
|
Veselin Stoyanov. It is based on Google's BERT model released in 2018.
|
||||||
|
|
||||||
|
It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
|
||||||
|
objective and training with much larger mini-batches and learning rates.
|
||||||
|
|
||||||
|
This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained
|
||||||
|
models.
|
||||||
|
|
||||||
|
This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
|
||||||
|
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`:
|
||||||
|
https://arxiv.org/abs/1907.11692
|
||||||
|
|
||||||
|
.. _`tf.keras.Model`:
|
||||||
|
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
|
||||||
|
|
||||||
|
Note on the model inputs:
|
||||||
|
TF 2.0 models accepts two formats as inputs:
|
||||||
|
|
||||||
|
- having all inputs as keyword arguments (like PyTorch models), or
|
||||||
|
- having all inputs as a list, tuple or dict in the first positional arguments.
|
||||||
|
|
||||||
|
This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
|
||||||
|
|
||||||
|
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
|
||||||
|
|
||||||
|
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
|
||||||
|
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
|
||||||
|
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
|
||||||
|
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
|
||||||
|
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
|
||||||
|
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
ROBERTA_INPUTS_DOCSTRING = r"""
|
||||||
|
Inputs:
|
||||||
|
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
|
||||||
|
|
||||||
|
(a) For sequence pairs:
|
||||||
|
|
||||||
|
``tokens: <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
|
||||||
|
|
||||||
|
(b) For single sequences:
|
||||||
|
|
||||||
|
``tokens: <s> the dog is hairy . </s>``
|
||||||
|
|
||||||
|
Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with
|
||||||
|
the ``add_special_tokens`` parameter set to ``True``.
|
||||||
|
|
||||||
|
RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
|
the right rather than the left.
|
||||||
|
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional` need to be trained) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Optional segment token indices to indicate first and second portions of the inputs.
|
||||||
|
This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it
|
||||||
|
during finetuning.
|
||||||
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
|
corresponds to a `sentence B` token
|
||||||
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
|
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1[``.
|
||||||
|
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class TFRobertaModel(TFRobertaPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaModel

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaModel.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
        # The name must stay 'roberta': it matches base_model_prefix on the parent class.
        self.roberta = TFRobertaMainLayer(config, name='roberta')

    def call(self, inputs, **kwargs):
        """Forward ``inputs`` and keyword arguments to the main layer and return its outputs as is."""
        return self.roberta(inputs, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class TFRobertaLMHead(tf.keras.layers.Layer):
    """Roberta Head for masked language modeling.

    Applies dense -> gelu -> layer norm, then projects with the shared ``input_embeddings`` layer
    (called with ``mode="linear"``) and adds an output-only bias of size ``config.vocab_size``.
    """
    def __init__(self, config, input_embeddings, **kwargs):
        super(TFRobertaLMHead, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size
        dense_init = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.Dense(config.hidden_size,
                                           kernel_initializer=dense_init,
                                           name='dense')
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
        self.act = tf.keras.layers.Activation(gelu)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = input_embeddings

    def build(self, input_shape):
        """Create the per-token output bias, then let Keras finish building the layer."""
        self.bias = self.add_weight(name='bias',
                                    shape=(self.vocab_size,),
                                    initializer='zeros',
                                    trainable=True)
        super(TFRobertaLMHead, self).build(input_shape)

    def call(self, features):
        """Return the vocabulary scores computed from ``features``."""
        hidden = self.layer_norm(self.act(self.dense(features)))

        # project back to size of vocabulary with bias
        scores = self.decoder(hidden, mode="linear") + self.bias
        return scores
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    This TF 2.0 model does not compute a masked language modeling loss: ``call`` returns the
    prediction scores as first element of the output tuple.

    Examples::

        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaForMaskedLM

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)

        self.roberta = TFRobertaMainLayer(config, name="roberta")
        # The head reuses the main layer's embeddings for its output projection.
        self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")

    def call(self, inputs, **kwargs):
        """Run the main layer, then the language modeling head on its first output."""
        outputs = self.roberta(inputs, **kwargs)

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        # outputs[1] is dropped; everything from index 2 on is passed through
        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here

        return outputs  # prediction_scores, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFRobertaClassificationHead(tf.keras.layers.Layer):
    """Head for sentence-level classification tasks.

    Takes the hidden state at sequence position 0 and applies
    dropout -> dense (tanh) -> dropout -> output projection to ``config.num_labels`` scores.
    """

    def __init__(self, config, **kwargs):
        # `config` must not be forwarded to the Keras base class: passed positionally it would be
        # taken as the first positional argument of `Layer.__init__` instead of a model config.
        super(TFRobertaClassificationHead, self).__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(config.hidden_size,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           activation='tanh',
                                           name="dense")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.out_proj = tf.keras.layers.Dense(config.num_labels,
                                              kernel_initializer=get_initializer(config.initializer_range),
                                              name="out_proj")

    def call(self, features, training=False):
        """Return the classification scores for ``features``.

        ``features`` is indexed as ``features[:, 0, :]``; ``training`` is forwarded to both
        dropout applications.
        """
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x, training=training)
        x = self.dense(x)
        x = self.dropout(x, training=training)
        x = self.out_proj(x)
        return x
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
    on top of the pooled output) e.g. for GLUE tasks. """,
    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        labels = tf.constant([1])[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.classifier = TFRobertaClassificationHead(config, name="classifier")

    def call(self, inputs, **kwargs):
        """Run the main layer, then the classification head on its first output."""
        outputs = self.roberta(inputs, **kwargs)

        sequence_output = outputs[0]
        # The head takes `training` explicitly; it defaults to False when the caller did not pass it.
        logits = self.classifier(sequence_output, training=kwargs.get('training', False))

        # outputs[1] is dropped; everything from index 2 on is passed through
        outputs = (logits,) + outputs[2:]

        return outputs  # logits, (hidden_states), (attentions)
|
||||||
763
transformers/modeling_tf_transfo_xl.py
Normal file
763
transformers/modeling_tf_transfo_xl.py
Normal file
@@ -0,0 +1,763 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 Transformer XL model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import logging
|
||||||
|
import collections
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .configuration_transfo_xl import TransfoXLConfig
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer
|
||||||
|
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||||
|
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5",
|
||||||
|
}
|
||||||
|
|
||||||
|
def load_transfo_xl_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """ Build ``tf_model`` on a small dummy batch, then load a PyTorch checkpoint into it.

        Returns whatever ``load_pytorch_checkpoint_in_tf2_model`` returns.
    """
    # A 3 x 5 batch of token ids, only used to run the network once so that it is built
    dummy_batch = tf.constant([[7, 6, 0, 0, 1],
                               [1, 2, 3, 0, 0],
                               [0, 0, 0, 4, 5]])
    tf_model(dummy_batch, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=dummy_batch)
|
||||||
|
|
||||||
|
|
||||||
|
class TFPositionalEmbedding(tf.keras.layers.Layer):
    """Sinusoidal positional embedding: concatenates sin and cos of ``pos_seq x inv_freq``."""
    def __init__(self, demb, **kwargs):
        super(TFPositionalEmbedding, self).__init__(**kwargs)

        # One inverse frequency per even index in [0, demb)
        exponents = tf.range(0, demb, 2.0) / demb
        self.inv_freq = 1 / (10000 ** exponents)

    def call(self, pos_seq, bsz=None):
        """Return the embedding with an axis inserted at position 1, tiled ``bsz`` times when given."""
        # Outer product between the positions and the inverse frequencies
        angles = tf.einsum('i,j->ij', pos_seq, self.inv_freq)
        pos_emb = tf.concat([tf.sin(angles), tf.cos(angles)], -1)

        expanded = pos_emb[:, None, :]
        if bsz is None:
            return expanded
        return tf.tile(expanded, [1, bsz, 1])
|
||||||
|
|
||||||
|
|
||||||
|
class TFPositionwiseFF(tf.keras.layers.Layer):
    """Position-wise feed-forward block (dense/relu -> dropout -> dense -> dropout) with a residual
    connection and layer normalization, applied before or after the block depending on ``pre_lnorm``.
    """
    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
        super(TFPositionwiseFF, self).__init__(**kwargs)

        self.d_model = d_model
        self.d_inner = d_inner
        self.dropout = dropout

        # Layer names presumably mirror the PyTorch module names for weight conversion -- TODO confirm
        self.layer_1 = tf.keras.layers.Dense(d_inner,
                                             kernel_initializer=get_initializer(init_std),
                                             activation=tf.nn.relu,
                                             name='CoreNet_._0')
        self.drop_1 = tf.keras.layers.Dropout(dropout)
        self.layer_2 = tf.keras.layers.Dense(d_model,
                                             kernel_initializer=get_initializer(init_std),
                                             name='CoreNet_._3')
        self.drop_2 = tf.keras.layers.Dropout(dropout)

        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')

        self.pre_lnorm = pre_lnorm

    def call(self, inp, training=False):
        """Apply the feed-forward block to ``inp``; ``training`` is forwarded to both dropouts."""
        # With pre_lnorm the normalization is applied to the input of the feed-forward path
        ff_input = self.layer_norm(inp) if self.pre_lnorm else inp

        ##### positionwise feed-forward
        hidden = self.drop_1(self.layer_1(ff_input), training=training)
        core_out = self.drop_2(self.layer_2(hidden), training=training)

        if self.pre_lnorm:
            ##### residual connection
            return core_out + inp

        ##### residual connection + layer normalization
        return self.layer_norm(inp + core_out)
|
||||||
|
|
||||||
|
|
||||||
|
class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
    """Multi-head self-attention with relative position terms.

    The attention score is the sum of a content term (query + ``r_w_bias``
    against the keys) and a position term (query + ``r_r_bias`` against the
    projected relative position embeddings, followed by ``_rel_shift``).

    ``r_r_bias`` / ``r_w_bias`` may be handed in by the caller (shared
    biases); when either is missing, both are created as weights of this
    layer in ``build``.

    ``tgt_len``, ``ext_len`` and ``mem_len`` are accepted but not used by
    this layer.
    """

    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
                 r_r_bias=None, r_w_bias=None, output_attentions=False,
                 layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
        super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)

        self.output_attentions = output_attentions
        self.n_head = n_head
        self.d_model = d_model
        self.d_head = d_head
        self.dropout = dropout

        # Single projection producing queries, keys and values at once.
        self.qkv_net = tf.keras.layers.Dense(
            3 * n_head * d_head,
            kernel_initializer=get_initializer(init_std),
            use_bias=False,
            name='qkv_net',
        )

        self.drop = tf.keras.layers.Dropout(dropout)
        self.dropatt = tf.keras.layers.Dropout(dropatt)
        self.o_net = tf.keras.layers.Dense(
            d_model,
            kernel_initializer=get_initializer(init_std),
            use_bias=False,
            name='o_net',
        )

        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')

        self.scale = 1 / (d_head ** 0.5)

        self.pre_lnorm = pre_lnorm

        # Biases are shared only when BOTH are supplied; otherwise they are
        # left unset here and created in `build`.
        biases_shared = r_r_bias is not None and r_w_bias is not None
        self.r_r_bias = r_r_bias if biases_shared else None
        self.r_w_bias = r_w_bias if biases_shared else None

        # Projection of the relative position embeddings into key space.
        self.r_net = tf.keras.layers.Dense(
            self.n_head * self.d_head,
            kernel_initializer=get_initializer(init_std),
            use_bias=False,
            name='r_net',
        )

    def build(self, input_shape):
        """Create per-layer ``r_r_bias`` / ``r_w_bias`` when they are not shared."""
        biases_missing = self.r_r_bias is None or self.r_w_bias is None
        if biases_missing:
            bias_shape = (self.n_head, self.d_head)
            self.r_r_bias = self.add_weight(shape=bias_shape,
                                            initializer='zeros',
                                            trainable=True,
                                            name='r_r_bias')
            self.r_w_bias = self.add_weight(shape=bias_shape,
                                            initializer='zeros',
                                            trainable=True,
                                            name='r_w_bias')
        super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)

    def _rel_shift(self, x):
        """Shift a 4-D score tensor via pad / reshape / slice / reshape.

        A zero column is prepended along axis 1, the first two axes are
        re-interpreted as ``[dim1 + 1, dim0]``, the first row is dropped and
        the original shape is restored.
        """
        original_shape = shape_list(x)

        padded = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
        swapped_shape = [original_shape[1] + 1, original_shape[0], original_shape[2], original_shape[3]]
        reshaped = tf.reshape(padded, swapped_shape)
        trimmed = tf.slice(reshaped, [1, 0, 0, 0], [-1, -1, -1, -1])

        return tf.reshape(trimmed, original_shape)

    def call(self, inputs, training=False):
        """Run relative multi-head attention.

        Args:
            inputs: 5-element sequence ``(w, r, attn_mask, mems, head_mask)``:
                ``w`` the current hidden states (first axis = query length,
                second axis = batch), ``r`` the relative position embeddings,
                ``attn_mask`` a 2-D mask or None (non-zero entries are masked
                out), ``mems`` cached states to prepend to ``w`` or None,
                ``head_mask`` a multiplier for the attention probabilities or
                None.
            training: forwarded to the dropout layers.

        Returns:
            A list whose first element is the attention output with the
            residual applied, followed by the attention probabilities when
            ``output_attentions`` is set.
        """
        w, r, attn_mask, mems, head_mask = inputs
        w_shape = shape_list(w)
        qlen, bsz = w_shape[0], w_shape[1]
        rlen = shape_list(r)[0]

        # Keys/values are computed over memory + current segment when a
        # memory is available, otherwise over the current segment only.
        has_mems = mems is not None
        context = tf.concat([mems, w], 0) if has_mems else w
        if self.pre_lnorm:
            context = self.layer_norm(context)

        w_heads = self.qkv_net(context)
        r_head_k = self.r_net(r)

        w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
        if has_mems:
            # Queries only exist for the current segment (the last qlen steps).
            w_head_q = w_head_q[-qlen:]

        klen = shape_list(w_head_k)[0]

        w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
        w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head))  # klen x bsz x n_head x d_head
        w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head))  # klen x bsz x n_head x d_head

        r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head))  # rlen x n_head x d_head

        #### compute attention score
        # content-based term
        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
        AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)  # qlen x klen x bsz x n_head

        # position-based term
        rr_head_q = w_head_q + self.r_r_bias
        BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)  # qlen x rlen x bsz x n_head
        BD = self._rel_shift(BD)

        # [qlen x klen x bsz x n_head]
        attn_score = (AC + BD) * self.scale

        #### compute attention probability
        if attn_mask is not None:
            # Masked positions get a very large negative score so that the
            # softmax assigns them (numerically) zero probability.
            attn_mask_t = attn_mask[:, :, None, None]
            attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t

        # [qlen x klen x bsz x n_head], normalized over the key axis
        attn_prob = tf.nn.softmax(attn_score, axis=1)
        attn_prob = self.dropatt(attn_prob, training=training)

        # Mask heads if we want to
        if head_mask is not None:
            attn_prob = attn_prob * head_mask

        #### compute attention vector
        attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)

        # [qlen x bsz x n_head x d_head] -> [qlen x bsz x n_head * d_head]
        vec_shape = shape_list(attn_vec)
        attn_vec = tf.reshape(attn_vec, (vec_shape[0], vec_shape[1], self.n_head * self.d_head))

        ##### linear projection
        attn_out = self.o_net(attn_vec)
        attn_out = self.drop(attn_out, training=training)

        ##### residual connection (+ layer normalization in post-norm mode)
        residual = w + attn_out
        outputs = [residual if self.pre_lnorm else self.layer_norm(residual)]

        if self.output_attentions:
            outputs.append(attn_prob)

        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
    """One decoder layer: relative multi-head attention + position-wise FF.

    All constructor arguments are forwarded to
    ``TFRelPartialLearnableMultiHeadAttn`` (sub-layer ``dec_attn``) and/or
    ``TFPositionwiseFF`` (sub-layer ``pos_ff``).
    """

    def __init__(self, n_head, d_model, d_head, d_inner, dropout,
                 tgt_len=None, ext_len=None, mem_len=None,
                 dropatt=0., pre_lnorm=False,
                 r_w_bias=None,
                 r_r_bias=None,
                 output_attentions=False,
                 layer_norm_epsilon=1e-5,
                 init_std=0.02,
                 **kwargs):
        super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)

        self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
            n_head,
            d_model,
            d_head,
            dropout,
            tgt_len=tgt_len,
            ext_len=ext_len,
            mem_len=mem_len,
            dropatt=dropatt,
            pre_lnorm=pre_lnorm,
            r_w_bias=r_w_bias,
            r_r_bias=r_r_bias,
            init_std=init_std,
            output_attentions=output_attentions,
            layer_norm_epsilon=layer_norm_epsilon,
            name='dec_attn',
        )
        self.pos_ff = TFPositionwiseFF(
            d_model,
            d_inner,
            dropout,
            pre_lnorm=pre_lnorm,
            init_std=init_std,
            layer_norm_epsilon=layer_norm_epsilon,
            name='pos_ff',
        )

    def call(self, inputs, training=False):
        """Run attention followed by the feed-forward sub-layer.

        Args:
            inputs: 5-element sequence
                ``(dec_inp, r, dec_attn_mask, mems, head_mask)`` passed on to
                the attention sub-layer as a list.
            training: forwarded to both sub-layers.

        Returns:
            A list: the feed-forward output first, then whatever extra
            outputs the attention sub-layer returned after its first element.
        """
        dec_inp, r, dec_attn_mask, mems, head_mask = inputs

        attn_inputs = [dec_inp, r, dec_attn_mask, mems, head_mask]
        attn_outputs = self.dec_attn(attn_inputs, training=training)

        ff_output = self.pos_ff(attn_outputs[0], training=training)

        # Keep any additional attention outputs (e.g. probabilities) after
        # the transformed hidden states.
        return [ff_output] + attn_outputs[1:]
|
||||||
|
|
||||||
|
|
||||||
|
class TFAdaptiveEmbedding(tf.keras.layers.Layer):
    """Adaptive input embedding split into vocabulary clusters.

    The vocabulary ``[0, n_token)`` is partitioned at ``cutoffs``.  Cluster
    ``i`` owns an embedding table of width ``d_embed // div_val ** i`` and a
    projection matrix of shape ``(d_emb_i, d_proj)`` that maps it to the
    common ``d_proj`` width.  The final embeddings are scaled by
    ``d_proj ** 0.5``.

    Only ``div_val != 1`` is implemented; ``div_val == 1`` raises
    ``NotImplementedError``.  ``sample_softmax`` is accepted but unused.
    """

    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02,
                 sample_softmax=False, **kwargs):
        super(TFAdaptiveEmbedding, self).__init__(**kwargs)

        self.n_token = n_token
        self.d_embed = d_embed
        self.init_std = init_std

        # The last cluster always ends at the vocabulary size.
        self.cutoffs = cutoffs + [n_token]
        self.div_val = div_val
        self.d_proj = d_proj

        self.emb_scale = d_proj ** 0.5

        # cutoff_ends[i] .. cutoff_ends[i + 1] is the id range of cluster i.
        self.cutoff_ends = [0] + self.cutoffs

        self.emb_layers = []
        self.emb_projs = []
        if div_val == 1:
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint

        cluster_bounds = zip(self.cutoff_ends[:-1], self.cutoff_ends[1:])
        for i, (l_idx, r_idx) in enumerate(cluster_bounds):
            d_emb_i = d_embed // (div_val ** i)
            table = tf.keras.layers.Embedding(
                r_idx - l_idx,
                d_emb_i,
                embeddings_initializer=get_initializer(init_std),
                name='emb_layers_._{}'.format(i),
            )
            self.emb_layers.append(table)

    def build(self, input_shape):
        """Create one ``(d_emb_i, d_proj)`` projection weight per cluster."""
        for i, _ in enumerate(self.cutoffs):
            d_emb_i = self.d_embed // (self.div_val ** i)
            projection = self.add_weight(shape=(d_emb_i, self.d_proj),
                                         initializer=get_initializer(self.init_std),
                                         trainable=True,
                                         name='emb_projs_._{}'.format(i))
            self.emb_projs.append(projection)
        super(TFAdaptiveEmbedding, self).build(input_shape)

    def call(self, inp):
        """Embed the token ids in ``inp``.

        Args:
            inp: tensor of token ids (any shape).

        Returns:
            Tensor of shape ``shape(inp) + [d_proj]``.
        """
        if self.div_val == 1:
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint

        inp_flat = tf.reshape(inp, (-1,))
        emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj])

        for i in range(len(self.cutoffs)):
            l_idx = self.cutoff_ends[i]
            r_idx = self.cutoff_ends[i + 1]

            # Positions whose token id falls into cluster i.
            mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)

            # Look up cluster-local ids and project to the common width.
            inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx
            emb_i = self.emb_layers[i](inp_i)
            emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i])

            # Scatter the cluster's embeddings back to their flat positions;
            # clusters are disjoint so the sum never overlaps.
            mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
            emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(tf.shape(emb_flat), dtype=tf.int64))

        embed_shape = shape_list(inp) + [self.d_proj]
        embed = tf.reshape(emb_flat, embed_shape)

        return embed * self.emb_scale
|
||||||
|
|
||||||
|
|
||||||
|
class TFTransfoXLMainLayer(tf.keras.layers.Layer):
    """Core Transformer-XL stack: adaptive embedding, decoder layers, memory.

    Only the default relative attention (``config.attn_type == 0``) is
    implemented; other attention types raise ``NotImplementedError``.
    """

    def __init__(self, config, **kwargs):
        super(TFTransfoXLMainLayer, self).__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        self.n_token = config.n_token

        self.d_embed = config.d_embed
        self.d_model = config.d_model
        self.n_head = config.n_head
        self.d_head = config.d_head
        self.untie_r = config.untie_r

        self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
                                            div_val=config.div_val, init_std=config.init_std, name='word_emb')

        self.drop = tf.keras.layers.Dropout(config.dropout)

        self.n_layer = config.n_layer

        self.tgt_len = config.tgt_len
        self.mem_len = config.mem_len
        self.ext_len = config.ext_len
        self.max_klen = config.tgt_len + config.ext_len + config.mem_len

        self.attn_type = config.attn_type

        self.layers = []
        if config.attn_type != 0:  # learnable embeddings and absolute embeddings
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint

        # the default attention
        for i in range(config.n_layer):
            # NOTE(review): with untie_r=False this reads self.r_w_bias /
            # self.r_r_bias, which are only created later in `build`, so
            # construction would fail with AttributeError.  Left as-is
            # because fixing it changes where the shared weights live.
            self.layers.append(
                TFRelPartialLearnableDecoderLayer(
                    config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
                    tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
                    dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
                    r_w_bias=None if self.untie_r else self.r_w_bias,
                    r_r_bias=None if self.untie_r else self.r_r_bias,
                    output_attentions=self.output_attentions,
                    layer_norm_epsilon=config.layer_norm_epsilon,
                    init_std=config.init_std,
                    name='layers_._{}'.format(i))
            )

        self.same_length = config.same_length
        self.clamp_len = config.clamp_len

        # attn_type is guaranteed to be 0 here (other values raised above).
        self.pos_emb = TFPositionalEmbedding(self.d_model, name='pos_emb')

    def build(self, input_shape):
        """Create the shared ``r_w_bias`` / ``r_r_bias`` when ``untie_r`` is False."""
        if not self.untie_r:
            self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
                                            initializer='zeros',
                                            trainable=True,
                                            name='r_w_bias')
            self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
                                            initializer='zeros',
                                            trainable=True,
                                            name='r_r_bias')
        super(TFTransfoXLMainLayer, self).build(input_shape)

    def _resize_token_embeddings(self, new_num_tokens):
        """Return the word embedding layer unchanged (``new_num_tokens`` is ignored)."""
        return self.word_emb

    def backward_compatible(self):
        """Set ``sample_softmax`` to -1 on this layer."""
        self.sample_softmax = -1

    def reset_length(self, tgt_len, ext_len, mem_len):
        """Overwrite the target, extended-context and memory lengths."""
        self.tgt_len = tgt_len
        self.mem_len = mem_len
        self.ext_len = ext_len

    def _prune_heads(self, heads):
        """Head pruning is not implemented for this model."""
        raise NotImplementedError

    def init_mems(self, data):
        """Build an all-zero memory for every layer.

        Args:
            data: tensor whose second axis gives the batch size.

        Returns:
            A list of ``n_layer`` zero tensors of shape
            ``[mem_len, batch, d_model]``, or None when ``mem_len <= 0``.
        """
        if self.mem_len <= 0:
            return None

        return [tf.zeros([self.mem_len, shape_list(data)[1], self.d_model])
                for _ in range(self.n_layer)]

    def _update_mems(self, hids, mems, qlen, mlen):
        """Compute the memory to carry over to the next segment.

        Args:
            hids: per-layer hidden states of the current segment.
            mems: per-layer memories used for the current segment, or None.
            qlen: length of the current segment.
            mlen: length of the memory used for the current segment.

        Returns:
            None when ``mems`` is None, otherwise one tensor per layer holding
            steps ``[beg_idx, end_idx)`` of ``concat(mems[i], hids[i])``, with
            gradients stopped.
        """
        # does not deal with None
        if mems is None:
            return None

        # mems is not None
        assert len(hids) == len(mems), 'len(hids) != len(mems)'

        # There are `mlen + qlen` steps that can be cached into mems
        # For the next step, the last `ext_len` of the `qlen` tokens
        # will be used as the extended context. Hence, we only cache
        # the tokens from `mlen + qlen - self.ext_len - self.mem_len`
        # to `mlen + qlen - self.ext_len`.
        end_idx = mlen + max(0, qlen - 0 - self.ext_len)
        beg_idx = max(0, end_idx - self.mem_len)

        new_mems = []
        for prev_mem, hid in zip(mems, hids):
            cat = tf.concat([prev_mem, hid], axis=0)
            # tf.stop_gradient returns a new tensor; the result must be used,
            # otherwise gradients still flow into the cached states.
            cat = tf.stop_gradient(cat)
            new_mems.append(cat[beg_idx:end_idx])

        return new_mems

    def call(self, inputs, mems=None, head_mask=None, training=False):
        """Run the Transformer-XL stack.

        Args:
            inputs: either the ``input_ids`` tensor of shape ``[bsz, len]``,
                a list/tuple ``(input_ids, mems, head_mask)`` (trailing items
                optional), or a dict with keys ``'input_ids'``, ``'mems'``,
                ``'head_mask'``.
            mems: per-layer memories; initialised with zeros when None.
            head_mask: must be None (head masking is not implemented).
            training: forwarded to dropout and to the decoder layers.

        Returns:
            A list: last hidden state ``[bsz, len, hidden_dim]``, new mems,
            then optionally all hidden states and all attentions.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
            assert len(inputs) <= 3, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            mems = inputs.get('mems', mems)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 3, "Too many inputs."
        else:
            input_ids = inputs

        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
        input_ids = tf.transpose(input_ids, perm=(1, 0))

        if mems is None:
            mems = self.init_mems(input_ids)

        qlen, bsz = shape_list(input_ids)

        # Head masking is not implemented: every layer gets a None mask.
        if head_mask is not None:
            raise NotImplementedError
        head_mask = [None] * self.n_layer

        word_emb = self.word_emb(input_ids)

        mlen = shape_list(mems[0])[0] if mems is not None else 0
        klen = mlen + qlen

        # Mask: 1 marks a key position a query may NOT attend to.  Memory
        # columns are always visible; within the segment the strict upper
        # triangle (future positions) is masked.
        attn_mask = tf.ones([qlen, qlen])
        mask_u = tf.linalg.band_part(attn_mask, 0, -1)
        mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
        attn_mask_pad = tf.zeros([qlen, mlen])
        dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
        if self.same_length:
            mask_l = tf.linalg.band_part(attn_mask, -1, 0)
            dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia,
                                       dec_attn_mask[:, qlen:]], 1)
        # ::: PyTorch masking code for reference :::
        # if self.same_length:
        #     all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
        #     mask_len = klen - self.mem_len
        #     if mask_len > 0:
        #         mask_shift_len = qlen - mask_len
        #     else:
        #         mask_shift_len = qlen
        #     dec_attn_mask = (torch.triu(all_ones, 1+mlen)
        #             + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1
        # else:
        #     dec_attn_mask = torch.triu(
        #         word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]

        if self.attn_type != 0:  # learnable embeddings and absolute embeddings
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint

        hids = []
        attentions = []

        # Relative positions klen-1 .. 0, optionally clamped.
        pos_seq = tf.range(klen-1, -1, -1.0)
        if self.clamp_len > 0:
            pos_seq = tf.minimum(pos_seq, self.clamp_len)
        pos_emb = self.pos_emb(pos_seq)

        core_out = self.drop(word_emb, training=training)
        pos_emb = self.drop(pos_emb, training=training)

        for i, layer in enumerate(self.layers):
            # The input of each layer is what gets cached as memory.
            hids.append(core_out)
            mems_i = None if mems is None else mems[i]
            layer_outputs = layer([core_out, pos_emb, dec_attn_mask,
                                   mems_i, head_mask[i]], training=training)
            core_out = layer_outputs[0]
            if self.output_attentions:
                attentions.append(layer_outputs[1])

        core_out = self.drop(core_out, training=training)

        # Arguments follow the declared order (hids, mems, qlen, mlen); they
        # used to be passed swapped, which only went unnoticed because the
        # result is symmetric in qlen/mlen when ext_len == 0.
        new_mems = self._update_mems(hids, mems, qlen, mlen)

        # We transpose back here to shape [bsz, len, hidden_dim]
        outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems]
        if self.output_hidden_states:
            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
            hids.append(core_out)
            hids = [tf.transpose(t, perm=(1, 0, 2)) for t in hids]
            outputs.append(hids)
        if self.output_attentions:
            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
            attentions = [tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions]
            outputs.append(attentions)
        return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Configuration class used by the Transformer-XL models.
    config_class = TransfoXLConfig
    # Archive map for the pretrained TF Transformer-XL models.
    pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
    # Loader hook; presumably converts PyTorch weights into the TF 2.0 model -- TODO confirm against the helper.
    load_pt_weights = load_transfo_xl_pt_weights_in_tf2
    # Matches the attribute name (`self.transformer`) under which the subclasses in this file store the main layer.
    base_model_prefix = "transformer"
|
||||||
|
|
||||||
|
|
||||||
|
TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
|
||||||
|
`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
|
||||||
|
by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||||
|
It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse
|
||||||
|
previously computed hidden-states to attend to longer context (memory).
|
||||||
|
This model also uses adaptive softmax inputs and outputs (tied).
|
||||||
|
|
||||||
|
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
|
||||||
|
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
|
||||||
|
https://arxiv.org/abs/1901.02860
|
||||||
|
|
||||||
|
.. _`tf.keras.Model`:
|
||||||
|
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
|
||||||
|
|
||||||
|
Note on the model inputs:
|
||||||
|
TF 2.0 models accept two formats as inputs:
|
||||||
|
|
||||||
|
- having all inputs as keyword arguments (like PyTorch models), or
|
||||||
|
- having all inputs as a list, tuple or dict in the first positional arguments.
|
||||||
|
|
||||||
|
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
|
||||||
|
|
||||||
|
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
|
||||||
|
|
||||||
|
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
|
||||||
|
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
|
||||||
|
`model([input_ids])` or `model([input_ids, mems])`
|
||||||
|
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
|
||||||
|
`model({'input_ids': input_ids, 'mems': mems})`
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
|
||||||
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
TRANSFO_XL_INPUTS_DOCSTRING = r"""
|
||||||
|
Inputs:
|
||||||
|
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
|
||||||
|
the right or on the left.
|
||||||
|
Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**mems**: (`optional`)
|
||||||
|
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
|
||||||
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
|
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
|
||||||
|
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare Transformer-XL Model transformer outputting raw hidden-states without any specific head on top.",
                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the last layer of the model.
        **mems**:
            list of ``tf.Tensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import TransfoXLTokenizer, TFTransfoXLModel

        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states, mems = outputs[:2]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
        # The whole model is the main layer; this class only exposes it.
        self.transformer = TFTransfoXLMainLayer(config, name='transformer')

    def call(self, inputs, **kwargs):
        """Forward ``inputs`` and keyword arguments to the main layer and return its outputs."""
        outputs = self.transformer(inputs, **kwargs)
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
    (adaptive softmax with weights tied to the adaptive input embeddings)""",
                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``None`` if ``labels`` is provided else ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
            We don't output them when the loss is computed to speedup adaptive softmax decoding.
        **mems**:
            list of ``tf.Tensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel

        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores, mems = outputs[:2]

    """
    def __init__(self, config, *inputs, **kwargs):
        # Accept and forward extra positional/keyword arguments, like the
        # sibling TFTransfoXLModel does; calling with `config` alone still works.
        super(TFTransfoXLLMHeadModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFTransfoXLMainLayer(config, name='transformer')
        self.sample_softmax = config.sample_softmax
        # use sampled softmax
        if config.sample_softmax > 0:
            raise NotImplementedError
        # use adaptive softmax (including standard softmax)
        else:
            self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model,
                                              config.cutoffs, div_val=config.div_val, name='crit')

    def reset_length(self, tgt_len, ext_len, mem_len):
        """Forward the new target / extended-context / memory lengths to the main layer."""
        self.transformer.reset_length(tgt_len, ext_len, mem_len)

    def init_mems(self, data):
        """Return the main layer's initial memories for ``data``."""
        return self.transformer.init_mems(data)

    def call(self, inputs, mems=None, head_mask=None, labels=None, training=False):
        """Run the transformer and the adaptive softmax head.

        Args:
            inputs: either the ``input_ids`` tensor, a list/tuple
                ``(input_ids, mems, head_mask, labels)`` (trailing items
                optional), or a dict with keys ``'input_ids'``, ``'mems'``,
                ``'head_mask'``, ``'labels'``.
            mems: per-layer memories forwarded to the main layer.
            head_mask: forwarded to the main layer.
            labels: forwarded to the adaptive softmax together with the
                hidden states.
            training: forwarded to the main layer and the softmax head.

        Returns:
            A list: the softmax head output first, followed by every main
            layer output after the last hidden state (new mems, then the
            optional hidden states and attentions).
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
            labels = inputs[3] if len(inputs) > 3 else labels
            assert len(inputs) <= 4, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            mems = inputs.get('mems', mems)
            head_mask = inputs.get('head_mask', head_mask)
            labels = inputs.get('labels', labels)
            assert len(inputs) <= 4, "Too many inputs."
        else:
            input_ids = inputs

        # input_ids is [bsz, len]; only the sequence length is needed here.
        tgt_len = shape_list(input_ids)[1]

        transformer_outputs = self.transformer([input_ids, mems, head_mask], training=training)

        last_hidden = transformer_outputs[0]
        # Keep the hidden states of the last tgt_len positions.
        pred_hid = last_hidden[:, -tgt_len:]
        outputs = transformer_outputs[1:]
        if self.sample_softmax > 0 and training:
            raise NotImplementedError
        else:
            # pred_hid = tf.reshape(pred_hid, (-1, shape_list(pred_hid)[-1]))
            softmax_output = self.crit([pred_hid, labels], training=training)
            # softmax_output = tf.reshape(softmax_output, (bsz, tgt_len, -1))
            outputs = [softmax_output] + outputs

        return outputs  # logits, new_mems, (all hidden states), (all attentions)
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user