Compare commits
156 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7bd11dda6f | ||
|
|
c3248cf122 | ||
|
|
f2ac50cb55 | ||
|
|
4cbdc7d910 | ||
|
|
dd2add9f6e | ||
|
|
df160af736 | ||
|
|
5b7b78e088 | ||
|
|
866d73ca26 | ||
|
|
d461472948 | ||
|
|
f24a228a93 | ||
|
|
c8ed1c82c8 | ||
|
|
5a5c4349e8 | ||
|
|
7296f1010b | ||
|
|
5d67aa21ae | ||
|
|
fe92755b99 | ||
|
|
fbf5455a86 | ||
|
|
90df44f0aa | ||
|
|
707f9e9241 | ||
|
|
137e20a846 | ||
|
|
d5712f7cac | ||
|
|
9c58b236ef | ||
|
|
413f41921b | ||
|
|
386a93f0f8 | ||
|
|
2d103546ef | ||
|
|
1748fdf657 | ||
|
|
36fc52a3b4 | ||
|
|
371c5ddfad | ||
|
|
5505cf7014 | ||
|
|
9cb97c0c0f | ||
|
|
95854c4a2f | ||
|
|
d2100428d3 | ||
|
|
597ba7feb3 | ||
|
|
6a43dc9d7d | ||
|
|
a09da4eeb0 | ||
|
|
57b5cb3eaa | ||
|
|
c03c0dfd23 | ||
|
|
4f15e5a267 | ||
|
|
18e1f751f1 | ||
|
|
31e5b5ff22 | ||
|
|
3d57c51111 | ||
|
|
c999a3e505 | ||
|
|
030faccb8d | ||
|
|
29570db25b | ||
|
|
2e2f9fed55 | ||
|
|
4c12860f7a | ||
|
|
51ae203290 | ||
|
|
58d75aa310 | ||
|
|
6a73382706 | ||
|
|
dc4e9e5cb3 | ||
|
|
e6cff60b4c | ||
|
|
4b82c485de | ||
|
|
e57d00ee10 | ||
|
|
ecabbf6d28 | ||
|
|
1d18930462 | ||
|
|
f7eba09007 | ||
|
|
2a64107e44 | ||
|
|
c0707a85d2 | ||
|
|
ade3cdf5ad | ||
|
|
076602bdc4 | ||
|
|
5909f71028 | ||
|
|
a1994a71ee | ||
|
|
3a9a9f7861 | ||
|
|
693606a75c | ||
|
|
c0443df593 | ||
|
|
2403a66598 | ||
|
|
4d18199902 | ||
|
|
9f75565ea8 | ||
|
|
4735c2af07 | ||
|
|
ba089c780b | ||
|
|
9660ba1cbd | ||
|
|
1c71ecc880 | ||
|
|
07f4cd73f6 | ||
|
|
5c877fe94a | ||
|
|
79526f82f5 | ||
|
|
9626e0458c | ||
|
|
2d73591a18 | ||
|
|
0eb973b0d9 | ||
|
|
a03fcf570d | ||
|
|
f71b1bb05a | ||
|
|
2a4ef098d6 | ||
|
|
00c4e39581 | ||
|
|
3520be7824 | ||
|
|
0cb163865a | ||
|
|
2670b0d682 | ||
|
|
35401fe50f | ||
|
|
e4679cddce | ||
|
|
1d87b37d10 | ||
|
|
4cb9b60558 | ||
|
|
5482822a2b | ||
|
|
fc1bb1f867 | ||
|
|
21451ec6ba | ||
|
|
f230d91b43 | ||
|
|
d0383e4daf | ||
|
|
e9217da5ff | ||
|
|
9ecd83dace | ||
|
|
35ff345fc9 | ||
|
|
552c44a9b1 | ||
|
|
ee53de7aac | ||
|
|
f8fb4335c9 | ||
|
|
bebaa14039 | ||
|
|
18fb93530b | ||
|
|
2d5d86e037 | ||
|
|
af077b15e2 | ||
|
|
3268ebd229 | ||
|
|
6c5297a423 | ||
|
|
9200a759d7 | ||
|
|
1f179f095f | ||
|
|
1eaf44e713 | ||
|
|
71e4693f08 | ||
|
|
f9f395b21c | ||
|
|
75a97af6bc | ||
|
|
8b388827b5 | ||
|
|
d425a4d60b | ||
|
|
1eb89ddf73 | ||
|
|
7f998b1b83 | ||
|
|
fb0d2f1da1 | ||
|
|
3ba417e1a8 | ||
|
|
ce158a076f | ||
|
|
7a03519975 | ||
|
|
96fa9a8a70 | ||
|
|
33508ae310 | ||
|
|
f7e4a7cdfa | ||
|
|
a7ca6d738b | ||
|
|
cca75e7884 | ||
|
|
bf119c0568 | ||
|
|
ff98b041da | ||
|
|
9ddc3f1a12 | ||
|
|
5bfcd0485e | ||
|
|
cae641ff26 | ||
|
|
254ebb979c | ||
|
|
ecb923da9c | ||
|
|
40255ab002 | ||
|
|
e4fbf3e2cc | ||
|
|
de276de1c1 | ||
|
|
7edb51f3a5 | ||
|
|
c835bc85c2 | ||
|
|
285b1241e3 | ||
|
|
c356290c8d | ||
|
|
76c0bc06d5 | ||
|
|
b90791e950 | ||
|
|
1e9ac5a7cf | ||
|
|
0b84b9fd8a | ||
|
|
f671997ef7 | ||
|
|
bd41e8292a | ||
|
|
0669c1fcd1 | ||
|
|
e0e55bc550 | ||
|
|
c3ba645237 | ||
|
|
a5a8a6175f | ||
|
|
a7dafe2f41 | ||
|
|
9f374c8252 | ||
|
|
72e506b22e | ||
|
|
ea52f82455 | ||
|
|
4193aa9f81 | ||
|
|
d08a338c3b | ||
|
|
124409d075 | ||
|
|
8df7dfd2a7 |
@@ -70,6 +70,27 @@ jobs:
|
||||
- run: sudo pip install pytest codecov pytest-cov
|
||||
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||
- run: codecov
|
||||
build_py3_custom_tokenizers:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest
|
||||
- run: sudo pip install mecab-python3
|
||||
- run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
|
||||
build_py2_custom_tokenizers:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:2.7
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest
|
||||
- run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
|
||||
- run: sudo pip install mecab-python
|
||||
- run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
|
||||
deploy_doc:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
@@ -82,6 +103,16 @@ jobs:
|
||||
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
||||
- run: sudo pip install --progress-bar off -r requirements.txt
|
||||
- run: ./.circleci/deploy.sh
|
||||
repository_consistency:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
resource_class: small
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install requests
|
||||
- run: python ./utils/link_tester.py
|
||||
workflow_filters: &workflow_filters
|
||||
filters:
|
||||
branches:
|
||||
@@ -91,6 +122,9 @@ workflows:
|
||||
version: 2
|
||||
build_and_test:
|
||||
jobs:
|
||||
- repository_consistency
|
||||
- build_py3_custom_tokenizers
|
||||
- build_py2_custom_tokenizers
|
||||
- build_py3_torch_and_tf
|
||||
- build_py3_torch
|
||||
- build_py3_tf
|
||||
|
||||
17
README.md
17
README.md
@@ -58,7 +58,7 @@ Choose the right framework for every part of a model's lifetime
|
||||
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
||||
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
||||
| [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
||||
| [Documentation][(v2.2.0/v2.2.1)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
|
||||
| [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
|
||||
|
||||
## Installation
|
||||
|
||||
@@ -101,17 +101,26 @@ pip install [--editable] .
|
||||
|
||||
A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||
|
||||
These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
||||
These tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||
|
||||
Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
|
||||
|
||||
You can run the tests from the root of the cloned repository with the commands:
|
||||
|
||||
```bash
|
||||
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
python -m pytest -sv ./transformers/tests/
|
||||
python -m pytest -sv ./examples/
|
||||
```
|
||||
|
||||
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||
|
||||
### Do you want to run a Transformer model on a mobile device?
|
||||
|
||||
You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
|
||||
@@ -131,10 +140,10 @@ At some point in the future, you'll be able to seamlessly move from pre-training
|
||||
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
|
||||
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
|
||||
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
11. **[ALBERT](https://github.com/google-research/google-research/tree/master/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
12. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
|
||||
|
||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
||||
|
||||
@@ -26,7 +26,7 @@ author = u'huggingface'
|
||||
# The short X.Y version
|
||||
version = u''
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = u'2.2.1'
|
||||
release = u'2.2.2'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
@@ -49,7 +49,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
||||
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
|
||||
9. `CTRL <https://github.com/salesforce/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
|
||||
11. `ALBERT <https://github.com/pytorch/fairseq/tree/master/examples/albert>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
@@ -24,15 +24,24 @@ pip install [--editable] .
|
||||
|
||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||
|
||||
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
||||
Tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||
|
||||
Run all the tests from the root of the cloned repository with the commands:
|
||||
|
||||
```bash
|
||||
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
``` bash
|
||||
python -m pytest -sv ./transformers/tests/
|
||||
python -m pytest -sv ./examples/
|
||||
```
|
||||
|
||||
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||
|
||||
## OpenAI GPT original tokenization workflow
|
||||
|
||||
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
|
||||
|
||||
@@ -5,6 +5,7 @@ The ``.optimization`` module provides:
|
||||
|
||||
- an optimizer with weight decay fixed that can be used to fine-tune models, and
|
||||
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
|
||||
- a gradient accumulation class to accumulate the gradients of multiple batches
|
||||
|
||||
``AdamW``
|
||||
~~~~~~~~~~~~~~~~
|
||||
@@ -12,6 +13,15 @@ The ``.optimization`` module provides:
|
||||
.. autoclass:: transformers.AdamW
|
||||
:members:
|
||||
|
||||
``AdamWeightDecay``
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AdamWeightDecay
|
||||
:members:
|
||||
|
||||
.. autofunction:: transformers.create_optimizer
|
||||
:members:
|
||||
|
||||
Schedules
|
||||
----------------------------------------------------
|
||||
|
||||
@@ -49,3 +59,17 @@ Learning Rate Schedules
|
||||
.. image:: /imgs/warmup_linear_schedule.png
|
||||
:target: /imgs/warmup_linear_schedule.png
|
||||
:alt:
|
||||
|
||||
``Warmup``
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.Warmup
|
||||
:members:
|
||||
|
||||
Gradient Strategies
|
||||
----------------------------------------------------
|
||||
|
||||
``GradientAccumulator``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.GradientAccumulator
|
||||
|
||||
@@ -54,8 +54,7 @@ Additionally, the following method can be used to load values from a data file
|
||||
Example usage
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
An example using these processors is given in the
|
||||
`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
|
||||
An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
|
||||
|
||||
|
||||
XNLI
|
||||
@@ -74,8 +73,81 @@ This library hosts the processor to load the XNLI data:
|
||||
|
||||
Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
|
||||
|
||||
Example usage
|
||||
An example using these processors is given in the
|
||||
`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
|
||||
|
||||
|
||||
SQuAD
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer/>`__ is a benchmark that evaluates
|
||||
the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
|
||||
`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside
|
||||
the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
|
||||
|
||||
This library hosts a processor for each of the two versions:
|
||||
|
||||
Processors
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
An example using these processors is given in the
|
||||
`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
|
||||
Those processors are:
|
||||
- :class:`~transformers.data.processors.squad.SquadV1Processor`
|
||||
- :class:`~transformers.data.processors.squad.SquadV2Processor`
|
||||
|
||||
They both inherit from the abstract class :class:`~transformers.data.processors.squad.SquadProcessor`
|
||||
|
||||
.. autoclass:: transformers.data.processors.squad.SquadProcessor
|
||||
:members:
|
||||
|
||||
Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.squad.SquadFeatures`
|
||||
that can be used as model inputs.
|
||||
|
||||
.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
|
||||
|
||||
These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
|
||||
Examples are given below.
|
||||
|
||||
|
||||
Example usage
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Here is an example using the processors as well as the conversion method using data files:
|
||||
|
||||
Example::
|
||||
|
||||
# Loading a V2 processor
|
||||
processor = SquadV2Processor()
|
||||
examples = processor.get_dev_examples(squad_v2_data_dir)
|
||||
|
||||
# Loading a V1 processor
|
||||
processor = SquadV1Processor()
|
||||
examples = processor.get_dev_examples(squad_v1_data_dir)
|
||||
|
||||
features = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=max_query_length,
|
||||
is_training=not evaluate,
|
||||
)
|
||||
|
||||
Using `tensorflow_datasets` is as easy as using a data file:
|
||||
|
||||
Example::
|
||||
|
||||
# tensorflow_datasets only handles Squad V1.
|
||||
tfds_examples = tfds.load("squad")
|
||||
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||
|
||||
features = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=max_query_length,
|
||||
is_training=not evaluate,
|
||||
)
|
||||
|
||||
|
||||
Another example using these processors is given in the
|
||||
`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
|
||||
|
||||
@@ -104,6 +104,6 @@ for batch in train_data:
|
||||
loss = model(batch)
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
|
||||
scheduler.step()
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
```
|
||||
|
||||
@@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on uncased German text by DBMDZ |
|
||||
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. |
|
||||
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
|
||||
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text. Text is tokenized into characters. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | OpenAI GPT English model |
|
||||
@@ -155,6 +173,10 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||
| | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``distilbert-base-multilingual-cased`` | | 6-layer, 768-hidden, 12-heads, 134M parameters |
|
||||
| | | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters |
|
||||
| | | | Salesforce's Large-sized CTRL English model |
|
||||
@@ -165,35 +187,35 @@ Here is the full list of the currently provided pretrained models together with
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||
| | | | ALBERT base model |
|
||||
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||
| | | | ALBERT large model |
|
||||
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||
| | | | ALBERT xlarge model |
|
||||
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-xxlarge-v1`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||
| | | | ALBERT xxlarge model |
|
||||
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||
| | | | ALBERT base model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||
| | | | ALBERT large model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||
| | | | ALBERT xlarge model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``albert-xxlarge-v2`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
|
||||
|
||||
@@ -4,12 +4,14 @@ In this section a few examples are put together. All of these examples work for
|
||||
similar API between the different models.
|
||||
|
||||
**Important**
|
||||
To run the latest versions of the examples, you have to install from source. Execute the following steps in a new virtual environment:
|
||||
To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
|
||||
Execute the following steps in a new virtual environment:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/transformers
|
||||
cd transformers
|
||||
pip install [--editable] .
|
||||
pip install -r ./examples/requirements.txt
|
||||
```
|
||||
|
||||
| Section | Description |
|
||||
@@ -22,7 +24,6 @@ pip install [--editable] .
|
||||
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
||||
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
|
||||
| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
|
||||
| [Abstractive summarization](#abstractive-summarization) | Fine-tuning the library models for abstractive summarization tasks on the CNN/Daily Mail dataset. |
|
||||
|
||||
## TensorFlow 2.0 Bert models on GLUE
|
||||
|
||||
@@ -465,7 +466,8 @@ Training with the previously defined hyper-parameters yields the following resul
|
||||
|
||||
## Named Entity Recognition
|
||||
|
||||
Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py).
|
||||
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
|
||||
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
|
||||
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
|
||||
Details and results for the fine-tuning provided by @stefan-it.
|
||||
|
||||
@@ -510,7 +512,7 @@ The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so
|
||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
||||
```
|
||||
|
||||
### Training
|
||||
### Prepare the run
|
||||
|
||||
Additional environment variables must be set:
|
||||
|
||||
@@ -522,6 +524,8 @@ export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
```
|
||||
|
||||
### Run the Pytorch version
|
||||
|
||||
To start training, just run:
|
||||
|
||||
```bash
|
||||
@@ -542,7 +546,7 @@ python3 run_ner.py --data_dir ./ \
|
||||
|
||||
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
|
||||
|
||||
### Evaluation
|
||||
#### Evaluation
|
||||
|
||||
Evaluation on development dataset outputs the following for our example:
|
||||
|
||||
@@ -564,7 +568,7 @@ On the test dataset the following results could be achieved:
|
||||
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
|
||||
```
|
||||
|
||||
### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
|
||||
#### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
|
||||
|
||||
Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
|
||||
|
||||
@@ -574,32 +578,70 @@ Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) a
|
||||
| `roberta-large` | 95.96 | 91.87
|
||||
| `distilbert-base-uncased` | 94.34 | 90.32
|
||||
|
||||
## Abstractive summarization
|
||||
### Run the Tensorflow 2 version
|
||||
|
||||
Based on the script
|
||||
[`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py).
|
||||
|
||||
Before running this script you should download **both** CNN and Daily Mail
|
||||
datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the
|
||||
links next to "Stories") in the same folder. Then uncompress the archives by running:
|
||||
To start training, just run:
|
||||
|
||||
```bash
|
||||
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
|
||||
python3 run_tf_ner.py --data_dir ./ \
|
||||
--model_type bert \
|
||||
--labels ./labels.txt \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_device_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
```
|
||||
|
||||
note that the finetuning script **will not work** if you do not download both
|
||||
datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both
|
||||
archive.
|
||||
As with the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
|
||||
|
||||
#### Evaluation
|
||||
|
||||
Evaluation on development dataset outputs the following for our example:
|
||||
```bash
|
||||
export DATA_PATH=/path/to/dataset/
|
||||
precision recall f1-score support
|
||||
|
||||
python run_summarization_finetuning.py \
|
||||
--output_dir=output \
|
||||
--model_type=bert2bert \
|
||||
--model_name_or_path=bert2bert \
|
||||
--do_train \
|
||||
--data_path=$DATA_PATH \
|
||||
LOCderiv 0.7619 0.6154 0.6809 52
|
||||
PERpart 0.8724 0.8997 0.8858 4057
|
||||
OTHpart 0.9360 0.9466 0.9413 711
|
||||
ORGpart 0.7015 0.6989 0.7002 269
|
||||
LOCpart 0.7668 0.8488 0.8057 496
|
||||
LOC 0.8745 0.9191 0.8963 235
|
||||
ORGderiv 0.7723 0.8571 0.8125 91
|
||||
OTHderiv 0.4800 0.6667 0.5581 18
|
||||
OTH 0.5789 0.6875 0.6286 16
|
||||
PERderiv 0.5385 0.3889 0.4516 18
|
||||
PER 0.5000 0.5000 0.5000 2
|
||||
ORG 0.0000 0.0000 0.0000 3
|
||||
|
||||
micro avg 0.8574 0.8862 0.8715 5968
|
||||
macro avg 0.8575 0.8862 0.8713 5968
|
||||
```
|
||||
|
||||
On the test dataset the following results could be achieved:
|
||||
```bash
|
||||
precision recall f1-score support
|
||||
|
||||
PERpart 0.8847 0.8944 0.8896 9397
|
||||
OTHpart 0.9376 0.9353 0.9365 1639
|
||||
ORGpart 0.7307 0.7044 0.7173 697
|
||||
LOC 0.9133 0.9394 0.9262 561
|
||||
LOCpart 0.8058 0.8157 0.8107 1150
|
||||
ORG 0.0000 0.0000 0.0000 8
|
||||
OTHderiv 0.5882 0.4762 0.5263 42
|
||||
PERderiv 0.6571 0.5227 0.5823 44
|
||||
OTH 0.4906 0.6667 0.5652 39
|
||||
ORGderiv 0.7016 0.7791 0.7383 172
|
||||
LOCderiv 0.8256 0.6514 0.7282 109
|
||||
PER 0.0000 0.0000 0.0000 11
|
||||
|
||||
micro avg 0.8722 0.8774 0.8748 13869
|
||||
macro avg 0.8712 0.8774 0.8740 13869
|
||||
```
|
||||
|
||||
## XNLI
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
|
||||
|
||||
**December 6th, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||
|
||||
**November 19th, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
|
||||
|
||||
**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
|
||||
@@ -17,8 +19,9 @@ Distil* is a class of compressed models that started with DistilBERT. DistilBERT
|
||||
|
||||
We have applied the same method to other Transformer architectures and released the weights:
|
||||
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
|
||||
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base` performance on GLUE while being twice faster and 35% smaller.
|
||||
- and more to come! 🤗🤗🤗
|
||||
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice faster and 35% smaller.
|
||||
- German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
|
||||
- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice as fast and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||
|
||||
For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
|
||||
|
||||
@@ -29,7 +32,7 @@ Here are the results on the dev sets of GLUE:
|
||||
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
||||
| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
||||
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
||||
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
|
||||
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
|
||||
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
|
||||
|
||||
<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly performed transfer learning on the pre-trained DistilRoBERTa.
|
||||
@@ -38,6 +41,14 @@ Here are the results on the dev sets of GLUE:
|
||||
|
||||
<sup>3</sup> We compute this score ourselves for completeness.
|
||||
|
||||
Here are the results on the *test* sets for 6 of the languages available in XNLI. The results are computed in the zero shot setting (trained on the English portion and evaluated on the target language portion):
|
||||
|
||||
| Model | English | Spanish | Chinese | German | Arabic | Urdu |
|
||||
| :---: | :---: | :---: | :---: | :---: | :---: | :---:|
|
||||
| mBERT base cased (computed) | 82.1 | 74.6 | 69.1 | 72.3 | 66.4 | 58.5 |
|
||||
| mBERT base uncased (reported)| 81.4 | 74.3 | 63.8 | 70.5 | 62.1 | 58.3 |
|
||||
| DistilmBERT | 78.2 | 69.1 | 64.0 | 66.3 | 59.1 | 54.7 |
|
||||
|
||||
## Setup
|
||||
|
||||
This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
|
||||
@@ -54,7 +65,7 @@ Transformers includes five pre-trained Distil* models, currently only provided f
|
||||
- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
|
||||
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
|
||||
- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
|
||||
- and more to come! 🤗🤗🤗
|
||||
- `distilbert-base-multilingual-cased`: DistilmBERT multilingual model pretrained with the supervision of `bert-base-multilingual-cased` on the concatenation of Wikipedia in 104 different languages. The model supports the 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). The model has 6 layers, 768 dimension and 12 heads, totaling 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base.
|
||||
|
||||
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
|
||||
|
||||
@@ -70,6 +81,7 @@ last_hidden_states = outputs[0] # The last hidden-state is the first element of
|
||||
Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
|
||||
- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
|
||||
- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
|
||||
- DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
|
||||
|
||||
|
||||
## How to train Distil*
|
||||
|
||||
@@ -21,7 +21,6 @@ import psutil
|
||||
import time
|
||||
from tqdm import trange, tqdm
|
||||
import numpy as np
|
||||
import psutil
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
@@ -3,4 +3,4 @@ tensorboard>=1.14.0
|
||||
tensorboardX==1.8
|
||||
psutil==5.6.3
|
||||
scipy==1.3.1
|
||||
transformers==2.0.0
|
||||
transformers
|
||||
|
||||
@@ -1,17 +1,15 @@
|
||||
# PPLM
|
||||
# Plug and Play Language Models: a Simple Approach to Controlled Text Generation
|
||||
|
||||
Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/)
|
||||
|
||||
This folder contains the original code used to run the Plug and Play Language Model (PPLM).
|
||||

|
||||
|
||||
## Plug and Play Language Models: a Simple Approach to Steerable Text Generation
|
||||
Authors: [Sumanth Dathathri](https://dathath.github.io/), Andrea Madotto, Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/)
|
||||
|
||||
PPLM allows a user to flexibly plug in one or more tiny attribute models representing the desired steering objective into a large, unconditional LM. The method has the key property that it uses the LM _as is_---no training or fine-tuning is required---which enables researchers to leverage best-in-class LMs even if they do not have the extensive hardware required to train them.
|
||||
|
||||
Paper link:
|
||||
Paper link: https://arxiv.org/abs/1912.02164
|
||||
|
||||
Blog link: https://eng.uber.com/pplm
|
||||
|
||||
Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM
|
||||
|
||||
|
||||
## Setup
|
||||
|
||||
@@ -27,7 +25,7 @@ cd examples/pplm
|
||||
### Example command for bag-of-words control
|
||||
|
||||
```bash
|
||||
python run_pplm.py -B space --cond_text "The president" --length 100 --gamma 1.5 --num_iterations 3 --num_samples 1 --stepsize 0.01 --window_length 5 --kl_scale 0.01 --gm_scale 0.95
|
||||
python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample
|
||||
```
|
||||
|
||||
### Tuning hyperparameters for bag-of-words control
|
||||
@@ -45,7 +43,7 @@ python run_pplm.py -B space --cond_text "The president" --length 100 --gamma 1.5
|
||||
### Example command for discriminator based sentiment control
|
||||
|
||||
```bash
|
||||
python run_pplm.py -D sentiment --class_label 3 --cond_text "The lake" --length 10 --gamma 1.0 --num_iterations 10 --num_samples 1 --stepsize 0.03 --kl_scale 0.01 --gm_scale 0.95
|
||||
python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample
|
||||
```
|
||||
|
||||
### Tuning hyperparameters for discriminator control
|
||||
@@ -54,8 +52,3 @@ python run_pplm.py -D sentiment --class_label 3 --cond_text "The lake" --length
|
||||
|
||||
2. Use `--class_label 3` for negative, and `--class_label 2` for positive
|
||||
|
||||
### Example command for detoxificiation:
|
||||
|
||||
```bash
|
||||
python run_pplm.py -D toxicity --length 100 --num_iterations 10 --cond-text 'TH PEOPLEMan goddreams Blacks' --gamma 1.0 --num_samples 10 --stepsize 0.02
|
||||
```
|
||||
|
||||
18
examples/pplm/pplm_classification_head.py
Normal file
18
examples/pplm/pplm_classification_head.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import torch
|
||||
|
||||
class ClassificationHead(torch.nn.Module):
|
||||
"""Classification Head for transformer encoders"""
|
||||
|
||||
def __init__(self, class_size, embed_size):
|
||||
super(ClassificationHead, self).__init__()
|
||||
self.class_size = class_size
|
||||
self.embed_size = embed_size
|
||||
# self.mlp1 = torch.nn.Linear(embed_size, embed_size)
|
||||
# self.mlp2 = (torch.nn.Linear(embed_size, class_size))
|
||||
self.mlp = torch.nn.Linear(embed_size, class_size)
|
||||
|
||||
def forward(self, hidden_state):
|
||||
# hidden_state = F.relu(self.mlp1(hidden_state))
|
||||
# hidden_state = self.mlp2(hidden_state)
|
||||
logits = self.mlp(hidden_state)
|
||||
return logits
|
||||
@@ -1,18 +1,19 @@
|
||||
#! /usr/bin/env python3
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Uber AI Team Authors.
|
||||
|
||||
#Copyright (c) 2019 Uber Technologies, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
"""
|
||||
Example command with bag of words:
|
||||
@@ -33,10 +34,10 @@ import torch.nn.functional as F
|
||||
from torch.autograd import Variable
|
||||
from tqdm import trange
|
||||
|
||||
from examples.run_pplm_discrim_train import ClassificationHead
|
||||
from transformers import GPT2Tokenizer
|
||||
from transformers.file_utils import cached_path
|
||||
from transformers.modeling_gpt2 import GPT2LMHeadModel
|
||||
from pplm_classification_head import ClassificationHead
|
||||
|
||||
PPLM_BOW = 1
|
||||
PPLM_DISCRIM = 2
|
||||
@@ -45,12 +46,9 @@ SMALL_CONST = 1e-15
|
||||
BIG_CONST = 1e10
|
||||
|
||||
BAG_OF_WORDS_ARCHIVE_MAP = {
|
||||
'kitchen': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/kitchen.txt",
|
||||
'legal': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
|
||||
'military': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
|
||||
'monsters': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/monsters.txt",
|
||||
'politics': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
|
||||
'positive_words': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/positive_words.txt",
|
||||
'religion': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
|
||||
'science': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
|
||||
'space': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
|
||||
@@ -74,14 +72,6 @@ DISCRIMINATOR_MODELS_PARAMS = {
|
||||
"default_class": 3,
|
||||
"pretrained_model": "gpt2-medium",
|
||||
},
|
||||
"toxicity": {
|
||||
"url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/toxic_classifier_head.pt",
|
||||
"class_size": 2,
|
||||
"embed_size": 1024,
|
||||
"class_vocab": {"non_toxic": 0, "toxic": 1},
|
||||
"default_class": 0,
|
||||
"pretrained_model": "gpt2-medium",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,19 @@
|
||||
#! /usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
# This code is licensed under a non-commercial license.
|
||||
#Copyright (c) 2019 Uber Technologies, Inc.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
#http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
@@ -21,6 +33,7 @@ from torchtext import datasets
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||
from pplm_classification_head import ClassificationHead
|
||||
|
||||
torch.manual_seed(0)
|
||||
np.random.seed(0)
|
||||
@@ -29,22 +42,6 @@ example_sentence = "This is incredible! I love it, this is the best chicken I ha
|
||||
max_length_seq = 100
|
||||
|
||||
|
||||
class ClassificationHead(torch.nn.Module):
|
||||
"""Classification Head for transformer encoders"""
|
||||
|
||||
def __init__(self, class_size, embed_size):
|
||||
super(ClassificationHead, self).__init__()
|
||||
self.class_size = class_size
|
||||
self.embed_size = embed_size
|
||||
# self.mlp1 = torch.nn.Linear(embed_size, embed_size)
|
||||
# self.mlp2 = (torch.nn.Linear(embed_size, class_size))
|
||||
self.mlp = torch.nn.Linear(embed_size, class_size)
|
||||
|
||||
def forward(self, hidden_state):
|
||||
# hidden_state = F.relu(self.mlp1(hidden_state))
|
||||
# hidden_state = self.mlp2(hidden_state)
|
||||
logits = self.mlp(hidden_state)
|
||||
return logits
|
||||
|
||||
|
||||
class Discriminator(torch.nn.Module):
|
||||
|
||||
@@ -380,7 +380,7 @@ def main():
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight deay if we apply some.")
|
||||
help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||
|
||||
@@ -188,6 +188,13 @@ def train(args, train_dataset, model, tokenizer):
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
|
||||
# Check if saved optimizer or scheduler states exist
|
||||
if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
|
||||
# Load in optimizer and scheduler states
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
@@ -216,14 +223,37 @@ def train(args, train_dataset, model, tokenizer):
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 0
|
||||
epochs_trained = 0
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
# set global_step to gobal_step of last saved checkpoint from model path
|
||||
global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model_to_resize = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_resize.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
|
||||
# Skip past any already trained steps if resuming training
|
||||
if steps_trained_in_current_epoch > 0:
|
||||
steps_trained_in_current_epoch -= 1
|
||||
continue
|
||||
|
||||
inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
|
||||
inputs = inputs.to(args.device)
|
||||
labels = labels.to(args.device)
|
||||
@@ -271,11 +301,17 @@ def train(args, train_dataset, model, tokenizer):
|
||||
os.makedirs(output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
_rotate_checkpoints(args, checkpoint_prefix)
|
||||
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
|
||||
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
|
||||
from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
@@ -23,11 +25,9 @@ import os
|
||||
import random
|
||||
import glob
|
||||
import timeit
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
try:
|
||||
@@ -44,18 +44,11 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
||||
XLNetForQuestionAnswering,
|
||||
XLNetTokenizer,
|
||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
|
||||
AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
|
||||
AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer,
|
||||
XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
|
||||
)
|
||||
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||
|
||||
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
||||
RawResult, write_predictions,
|
||||
RawResultExtended, write_predictions_extended)
|
||||
|
||||
# The follwing import is the official SQuAD evaluation script (2.0).
|
||||
# You can remove it from the dependencies if you are using this script outside of the library
|
||||
# We've added it here for automated tests (see examples/test_examples.py file)
|
||||
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -67,7 +60,8 @@ MODEL_CLASSES = {
|
||||
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
||||
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
|
||||
'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
|
||||
'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
|
||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer)
|
||||
}
|
||||
|
||||
def set_seed(args):
|
||||
@@ -100,14 +94,16 @@ def train(args, train_dataset, model, tokenizer):
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
|
||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||
|
||||
# multi-gpu training (should be after apex fp16 initialization)
|
||||
@@ -135,20 +131,26 @@ def train(args, train_dataset, model, tokenizer):
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
|
||||
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'start_positions': batch[3],
|
||||
'end_positions': batch[4]}
|
||||
|
||||
inputs = {
|
||||
'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'start_positions': batch[3],
|
||||
'end_positions': batch[4]
|
||||
}
|
||||
|
||||
if args.model_type != 'distilbert':
|
||||
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
|
||||
|
||||
if args.model_type in ['xlnet', 'xlm']:
|
||||
inputs.update({'cls_index': batch[5],
|
||||
'p_mask': batch[6]})
|
||||
inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
|
||||
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
|
||||
@@ -175,8 +177,8 @@ def train(args, train_dataset, model, tokenizer):
|
||||
model.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Log metrics
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Log metrics
|
||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
@@ -185,8 +187,8 @@ def train(args, train_dataset, model, tokenizer):
|
||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
# Save model checkpoint
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
@@ -215,6 +217,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||
|
||||
# Note that DistributedSampler samples randomly
|
||||
eval_sampler = SequentialSampler(dataset)
|
||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
@@ -227,38 +230,59 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||
logger.info(" Num examples = %d", len(dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
|
||||
all_results = []
|
||||
start_time = timeit.default_timer()
|
||||
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
model.eval()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
with torch.no_grad():
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1]
|
||||
}
|
||||
inputs = {
|
||||
'input_ids': batch[0],
|
||||
'attention_mask': batch[1]
|
||||
}
|
||||
|
||||
if args.model_type != 'distilbert':
|
||||
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
|
||||
|
||||
example_indices = batch[3]
|
||||
|
||||
# XLNet and XLM use more arguments for their predictions
|
||||
if args.model_type in ['xlnet', 'xlm']:
|
||||
inputs.update({'cls_index': batch[4],
|
||||
'p_mask': batch[5]})
|
||||
inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
|
||||
|
||||
outputs = model(**inputs)
|
||||
|
||||
for i, example_index in enumerate(example_indices):
|
||||
eval_feature = features[example_index.item()]
|
||||
unique_id = int(eval_feature.unique_id)
|
||||
if args.model_type in ['xlnet', 'xlm']:
|
||||
# XLNet uses a more complex post-processing procedure
|
||||
result = RawResultExtended(unique_id = unique_id,
|
||||
start_top_log_probs = to_list(outputs[0][i]),
|
||||
start_top_index = to_list(outputs[1][i]),
|
||||
end_top_log_probs = to_list(outputs[2][i]),
|
||||
end_top_index = to_list(outputs[3][i]),
|
||||
cls_logits = to_list(outputs[4][i]))
|
||||
|
||||
output = [to_list(output[i]) for output in outputs]
|
||||
|
||||
# Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
|
||||
# models only use two.
|
||||
if len(output) >= 5:
|
||||
start_logits = output[0]
|
||||
start_top_index = output[1]
|
||||
end_logits = output[2]
|
||||
end_top_index = output[3]
|
||||
cls_logits = output[4]
|
||||
|
||||
result = SquadResult(
|
||||
unique_id, start_logits, end_logits,
|
||||
start_top_index=start_top_index,
|
||||
end_top_index=end_top_index,
|
||||
cls_logits=cls_logits
|
||||
)
|
||||
|
||||
else:
|
||||
result = RawResult(unique_id = unique_id,
|
||||
start_logits = to_list(outputs[0][i]),
|
||||
end_logits = to_list(outputs[1][i]))
|
||||
start_logits, end_logits = output
|
||||
result = SquadResult(
|
||||
unique_id, start_logits, end_logits
|
||||
)
|
||||
|
||||
all_results.append(result)
|
||||
|
||||
evalTime = timeit.default_timer() - start_time
|
||||
@@ -267,84 +291,88 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
# Compute predictions
|
||||
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
|
||||
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
|
||||
|
||||
if args.version_2_with_negative:
|
||||
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
|
||||
else:
|
||||
output_null_log_odds_file = None
|
||||
|
||||
# XLNet and XLM use a more complex post-processing procedure
|
||||
if args.model_type in ['xlnet', 'xlm']:
|
||||
# XLNet uses a more complex post-processing procedure
|
||||
write_predictions_extended(examples, features, all_results, args.n_best_size,
|
||||
start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
|
||||
end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
|
||||
|
||||
predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
|
||||
args.max_answer_length, output_prediction_file,
|
||||
output_nbest_file, output_null_log_odds_file, args.predict_file,
|
||||
model.config.start_n_top, model.config.end_n_top,
|
||||
output_nbest_file, output_null_log_odds_file,
|
||||
start_n_top, end_n_top,
|
||||
args.version_2_with_negative, tokenizer, args.verbose_logging)
|
||||
else:
|
||||
write_predictions(examples, features, all_results, args.n_best_size,
|
||||
predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
|
||||
args.max_answer_length, args.do_lower_case, output_prediction_file,
|
||||
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
|
||||
args.version_2_with_negative, args.null_score_diff_threshold)
|
||||
|
||||
# Evaluate with the official SQuAD script
|
||||
evaluate_options = EVAL_OPTS(data_file=args.predict_file,
|
||||
pred_file=output_prediction_file,
|
||||
na_prob_file=output_null_log_odds_file)
|
||||
results = evaluate_on_squad(evaluate_options)
|
||||
# Compute the F1 and exact scores.
|
||||
results = squad_evaluate(examples, predictions)
|
||||
return results
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||
if args.local_rank not in [-1, 0] and not evaluate:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
input_file = args.predict_file if evaluate else args.train_file
|
||||
cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
|
||||
input_dir = args.data_dir if args.data_dir else "."
|
||||
cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
|
||||
'dev' if evaluate else 'train',
|
||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||
str(args.max_seq_length)))
|
||||
str(args.max_seq_length))
|
||||
)
|
||||
|
||||
# Init features and dataset from cache if it exists
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
features_and_dataset = torch.load(cached_features_file)
|
||||
features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", input_file)
|
||||
examples = read_squad_examples(input_file=input_file,
|
||||
is_training=not evaluate,
|
||||
version_2_with_negative=args.version_2_with_negative)
|
||||
features = convert_examples_to_features(examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=not evaluate,
|
||||
cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
|
||||
pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
|
||||
cls_token_at_end=True if args.model_type in ['xlnet'] else False,
|
||||
sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
|
||||
logger.info("Creating features from dataset file at %s", input_dir)
|
||||
|
||||
if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
|
||||
try:
|
||||
import tensorflow_datasets as tfds
|
||||
except ImportError:
|
||||
raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
|
||||
|
||||
if args.version_2_with_negative:
|
||||
logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
|
||||
|
||||
tfds_examples = tfds.load("squad")
|
||||
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||
else:
|
||||
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
|
||||
|
||||
if evaluate:
|
||||
examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
|
||||
else:
|
||||
examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
|
||||
|
||||
features, dataset = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=not evaluate,
|
||||
return_dataset='pt'
|
||||
)
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
torch.save({"features": features, "dataset": dataset}, cached_features_file)
|
||||
|
||||
if args.local_rank == 0 and not evaluate:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Convert to Tensors and build dataset
|
||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
|
||||
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
|
||||
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
|
||||
if evaluate:
|
||||
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
||||
all_example_index, all_cls_index, all_p_mask)
|
||||
else:
|
||||
all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
|
||||
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
||||
all_start_positions, all_end_positions,
|
||||
all_cls_index, all_p_mask)
|
||||
|
||||
if output_examples:
|
||||
return dataset, examples, features
|
||||
return dataset
|
||||
@@ -354,10 +382,6 @@ def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--train_file", default=None, type=str, required=True,
|
||||
help="SQuAD json for training. E.g., train-v1.1.json")
|
||||
parser.add_argument("--predict_file", default=None, type=str, required=True,
|
||||
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
|
||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
@@ -366,6 +390,15 @@ def main():
|
||||
help="The output directory where the model checkpoints and predictions will be written.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--data_dir", default=None, type=str,
|
||||
help="The input data dir. Should contain the .json files for the task." +
|
||||
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||
parser.add_argument("--train_file", default=None, type=str,
|
||||
help="The input training file. If a data dir is specified, will look for the file there" +
|
||||
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||
parser.add_argument("--predict_file", default=None, type=str,
|
||||
help="The input evaluation file. If a data dir is specified, will look for the file there" +
|
||||
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
@@ -547,10 +580,16 @@ def main():
|
||||
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||
|
||||
if args.do_train:
|
||||
logger.info("Loading checkpoints saved during training for evaluation")
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||
else:
|
||||
logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
|
||||
checkpoints = [args.model_name_or_path]
|
||||
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
|
||||
|
||||
@@ -1,492 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019 The HuggingFace Inc. team.
|
||||
# Copyright (c) 2019 The HuggingFace Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Finetuning seq2seq models for sequence generation."""
|
||||
|
||||
import argparse
|
||||
import functools
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
from tqdm import tqdm, trange
|
||||
import torch
|
||||
from torch.optim import Adam
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
BertForMaskedLM,
|
||||
BertConfig,
|
||||
PreTrainedEncoderDecoder,
|
||||
Model2Model,
|
||||
)
|
||||
|
||||
from utils_summarization import (
|
||||
CNNDailyMailDataset,
|
||||
encode_for_summarization,
|
||||
fit_to_block_size,
|
||||
build_lm_labels,
|
||||
build_mask,
|
||||
compute_token_type_ids,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
|
||||
|
||||
def set_seed(args):
    """ Seed the Python, NumPy and PyTorch random generators with `args.seed`. """
    seed = args.seed
    # Same order as before: stdlib first, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
|
||||
|
||||
|
||||
# ------------
|
||||
# Load dataset
|
||||
# ------------
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """ Build the CNN/DailyMail dataset found under `args.data_dir`.

    Args:
        args: parsed command-line namespace; only `args.data_dir` is read.
        tokenizer: tokenizer handed to `CNNDailyMailDataset`.
        evaluate: accepted so that callers may request the evaluation data
            with `evaluate=True` without raising a TypeError.
            NOTE(review): no separate evaluation split is wired up here, so
            the flag is currently ignored and the same directory is read.

    Returns:
        The `CNNDailyMailDataset` instance.
    """
    return CNNDailyMailDataset(tokenizer, data_dir=args.data_dir)
|
||||
|
||||
|
||||
def collate(data, tokenizer, block_size):
    """ Turn a list of (story, summary) tuples into the tensors fed to the model.

    Returns a 6-tuple: stories, summaries, encoder token type ids,
    encoder mask, decoder mask and language-model labels.
    """
    pad_id = tokenizer.pad_token_id

    # Drop the examples with an empty story or an empty summary.
    non_empty = [pair for pair in data if len(pair[0]) != 0 and len(pair[1]) != 0]

    # Encode every remaining example, then pad/truncate to the block size.
    encoded = [
        encode_for_summarization(story, summary, tokenizer)
        for story, summary in non_empty
    ]
    fitted_stories = []
    fitted_summaries = []
    for encoded_story, encoded_summary in encoded:
        fitted_stories.append(fit_to_block_size(encoded_story, block_size, pad_id))
        fitted_summaries.append(fit_to_block_size(encoded_summary, block_size, pad_id))

    stories = torch.tensor(fitted_stories)
    summaries = torch.tensor(fitted_summaries)

    encoder_token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
    encoder_mask = build_mask(stories, pad_id)
    decoder_mask = build_mask(summaries, pad_id)
    lm_labels = build_lm_labels(summaries, pad_id)

    return (
        stories,
        summaries,
        encoder_token_type_ids,
        encoder_mask,
        decoder_mask,
        lm_labels,
    )
|
||||
|
||||
|
||||
# ----------
|
||||
# Optimizers
|
||||
# ----------
|
||||
|
||||
|
||||
class BertSumOptimizer(object):
    """ Specific optimizer for BertSum.

    As described in [1], the authors fine-tune BertSum for abstractive
    summarization using two Adam Optimizers with different warm-up steps and
    learning rate. They also use a custom learning rate scheduler.

    One Adam instance is kept per stack under the keys "encoder" and
    "decoder"; `lr` and `warmup_steps` are dicts indexed by the same keys.

    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
        arXiv preprint arXiv:1908.08345 (2019).
    """

    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
        self.encoder = model.encoder
        self.decoder = model.decoder
        self.lr = lr
        self.warmup_steps = warmup_steps

        self.optimizers = {
            "encoder": Adam(
                model.encoder.parameters(),
                lr=lr["encoder"],
                betas=(beta_1, beta_2),
                eps=eps,
            ),
            "decoder": Adam(
                model.decoder.parameters(),
                lr=lr["decoder"],
                betas=(beta_1, beta_2),
                eps=eps,
            ),
        }

        self._step = 0

    def _update_rate(self, stack):
        """ Return the learning rate of `stack` for the current step.

        Implements lr * min(step^-0.5, step * warmup^-1.5), the schedule of [1]:
        a linear warm-up over `warmup_steps[stack]` updates followed by an
        inverse square-root decay.
        """
        # Clamp to 1 so that a call made before the first `step()` does not
        # evaluate 0 ** -0.5 (ZeroDivisionError).
        step = max(self._step, 1)
        return self.lr[stack] * min(
            step ** (-0.5), step * self.warmup_steps[stack] ** (-1.5)
        )

    def zero_grad(self):
        """ Reset the gradients tracked by both optimizers. """
        for optimizer in self.optimizers.values():
            optimizer.zero_grad()

    def step(self):
        """ Advance the schedule by one step and update both stacks. """
        self._step += 1
        for stack, optimizer in self.optimizers.items():
            new_rate = self._update_rate(stack)
            for param_group in optimizer.param_groups:
                param_group["lr"] = new_rate
            optimizer.step()
|
||||
|
||||
|
||||
# ------------
|
||||
# Train
|
||||
# ------------
|
||||
|
||||
|
||||
def train(args, model, tokenizer):
    """ Fine-tune the pretrained model on the corpus.

    Args:
        args: parsed command-line namespace. Reads `seed`,
            `per_gpu_train_batch_size`, `n_gpu`, `max_steps`,
            `gradient_accumulation_steps`, `num_train_epochs`,
            `max_grad_norm` and `device`; writes `train_batch_size` and,
            when `max_steps > 0`, `num_train_epochs`.
        model: encoder-decoder model exposing `.encoder` and `.decoder`.
        tokenizer: tokenizer used to encode and pad the examples.

    Returns:
        A `(global_step, average_loss)` tuple; the average is 0.0 when no
        optimizer update was performed.
    """
    set_seed(args)

    # Load the data
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataset = load_and_cache_examples(args, tokenizer)
    train_sampler = RandomSampler(train_dataset)
    model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=model_collate_fn,
    )

    # Training schedule
    updates_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps
    if args.max_steps > 0:
        t_total = args.max_steps
        # Enough epochs to reach `max_steps` updates. The "+ 1" sits outside
        # the division so that a `max_steps` smaller than one epoch still
        # yields one epoch instead of zero.
        args.num_train_epochs = t_total // max(1, updates_per_epoch) + 1
    else:
        t_total = updates_per_epoch * args.num_train_epochs

    # Prepare the optimizer
    lr = {"encoder": 0.002, "decoder": 0.2}
    warmup_steps = {"encoder": 20000, "decoder": 10000}
    optimizer = BertSumOptimizer(model, lr, warmup_steps)

    # Train
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(
        " Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
    )
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps,
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    model.zero_grad()
    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)

    global_step = 0
    tr_loss = 0.0
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
        for step, batch in enumerate(epoch_iterator):
            source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch

            source = source.to(args.device)
            target = target.to(args.device)
            encoder_token_type_ids = encoder_token_type_ids.to(args.device)
            encoder_mask = encoder_mask.to(args.device)
            decoder_mask = decoder_mask.to(args.device)
            lm_labels = lm_labels.to(args.device)

            model.train()
            outputs = model(
                source,
                target,
                encoder_token_type_ids=encoder_token_type_ids,
                encoder_attention_mask=encoder_mask,
                decoder_attention_mask=decoder_mask,
                decoder_lm_labels=lm_labels,
            )

            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss /= args.gradient_accumulation_steps

            loss.backward()

            step_loss = loss.item()
            tr_loss += step_loss
            # Per-step loss goes through the logger rather than stdout.
            logger.debug("step loss = %s", step_loss)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    # No update may have run (empty dataset, or fewer batches than the
    # accumulation window); avoid dividing by zero in that case.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss
|
||||
|
||||
|
||||
# ------------
|
||||
# Evaluate
|
||||
# ------------
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
    """ Compute the perplexity of `model` and write it to `eval_results.txt`.

    Args:
        args: parsed command-line namespace. Reads `seed`,
            `per_gpu_eval_batch_size`, `n_gpu`, `device` and `output_dir`;
            writes `eval_batch_size`.
        model: encoder-decoder model to evaluate.
        tokenizer: tokenizer used to encode and pad the examples.
        prefix: label inserted in the log headers.

    Returns:
        A dict with a single "perplexity" entry (a torch tensor).
    """
    set_seed(args)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # `load_and_cache_examples` is called with the two arguments it is known
    # to accept.
    eval_dataset = load_and_cache_examples(args, tokenizer)
    eval_sampler = SequentialSampler(eval_dataset)
    # The loop below unpacks six tensors per batch, which is what `collate`
    # produces; it must therefore be used here exactly as in training.
    model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=args.eval_batch_size,
        collate_fn=model_collate_fn,
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch

        source = source.to(args.device)
        target = target.to(args.device)
        encoder_token_type_ids = encoder_token_type_ids.to(args.device)
        encoder_mask = encoder_mask.to(args.device)
        decoder_mask = decoder_mask.to(args.device)
        lm_labels = lm_labels.to(args.device)

        with torch.no_grad():
            outputs = model(
                source,
                target,
                encoder_token_type_ids=encoder_token_type_ids,
                encoder_attention_mask=encoder_mask,
                decoder_attention_mask=decoder_mask,
                decoder_lm_labels=lm_labels,
            )
            lm_loss = outputs[0]
            # `.mean()` reduces the per-replica losses under DataParallel.
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps > 0:
        eval_loss = eval_loss / nb_eval_steps
    else:
        # Nothing was evaluated: report NaN instead of dividing by zero.
        logger.warning("No evaluation batch was processed; perplexity is undefined.")
        eval_loss = float("nan")
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    # Save the evaluation's results
    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
|
||||
|
||||
|
||||
def main():
    """ Parse the command line, then train and/or evaluate the summarizer.

    Returns:
        The evaluation results (an empty dict when no evaluation ran).

    Raises:
        ValueError: if training is requested into a non-empty output
            directory without `--do_overwrite_output_dir`.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input training data file (a text file).",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Optional parameters
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    # Boolean flags keep their `--flag VALUE` form, but go through
    # `_str2bool`: with `type=bool` any non-empty string, "False" included,
    # was parsed as True.
    parser.add_argument(
        "--do_evaluate",
        type=_str2bool,
        default=False,
        help="Run model evaluation on out-of-sample data.",
    )
    parser.add_argument("--do_train", type=_str2bool, default=False, help="Run training.")
    parser.add_argument(
        "--do_overwrite_output_dir",
        type=_str2bool,
        default=False,
        help="Whether to overwrite the output dir.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint to initialize the encoder and decoder's weights with.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        help="The decoder architecture to be fine-tuned.",
    )
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--to_cpu", default=False, type=_str2bool, help="Whether to force training on CPU."
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    # Read by `evaluate()`; it was previously never declared.
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument("--seed", default=42, type=int)
    args = parser.parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.do_overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
                args.output_dir
            )
        )

    # Set up training device
    if args.to_cpu or not torch.cuda.is_available():
        args.device = torch.device("cpu")
        args.n_gpu = 0
    else:
        args.device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()

    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    config = BertConfig.from_pretrained(args.model_name_or_path)
    decoder_model = BertForMaskedLM(config)
    model = Model2Model.from_pretrained(
        args.model_name_or_path, decoder_model=decoder_model
    )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        0,
        args.device,
        args.n_gpu,
        False,
        False,
    )

    logger.info("Training/evaluation parameters %s", args)

    # Train the model
    model.to(args.device)
    if args.do_train:
        global_step, tr_loss = train(args, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)

        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))

    # Evaluate the model
    results = {}
    if args.do_evaluate:
        # NOTE(review): the checkpoint list is still empty, so the loop below
        # never runs and `evaluate()` is never called; the "placeholder"
        # result is kept as found. Confirm the intended checkpoint layout
        # before wiring this up.
        checkpoints = []
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            encoder_checkpoint = os.path.join(checkpoint, "encoder")
            decoder_checkpoint = os.path.join(checkpoint, "decoder")
            model = PreTrainedEncoderDecoder.from_pretrained(
                encoder_checkpoint, decoder_checkpoint
            )
            model.to(args.device)
            results = "placeholder"

    return results


def _str2bool(value):
    """ Convert a command-line string such as "True" or "false" to a bool. """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays False, as it was with `type=bool`.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got %r." % value)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
615
examples/run_tf_ner.py
Normal file
615
examples/run_tf_ner.py
Normal file
@@ -0,0 +1,615 @@
|
||||
# coding=utf-8
|
||||
import datetime
|
||||
import os
|
||||
import math
|
||||
import glob
|
||||
import re
|
||||
import tensorflow as tf
|
||||
import collections
|
||||
import numpy as np
|
||||
from seqeval import metrics
|
||||
import _pickle as pickle
|
||||
from absl import logging
|
||||
from transformers import TF2_WEIGHTS_NAME, BertConfig, BertTokenizer, TFBertForTokenClassification
|
||||
from transformers import RobertaConfig, RobertaTokenizer, TFRobertaForTokenClassification
|
||||
from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForTokenClassification
|
||||
from transformers import create_optimizer, GradientAccumulator
|
||||
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
||||
from fastprogress import master_bar, progress_bar
|
||||
from absl import flags
|
||||
from absl import app
|
||||
|
||||
|
||||
# Shortcut names of every pretrained checkpoint listed in the archive maps of
# the supported configuration classes, flattened into a single tuple.
ALL_MODELS = tuple(
    shortcut_name
    for config_cls in (BertConfig, RobertaConfig, DistilBertConfig)
    for shortcut_name in config_cls.pretrained_config_archive_map.keys()
)

# Maps the --model_type flag value to its
# (config class, TF token-classification model class, tokenizer class) triple.
MODEL_CLASSES = dict(
    bert=(BertConfig, TFBertForTokenClassification, BertTokenizer),
    roberta=(RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
    distilbert=(DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer),
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Command-line flags (absl). Data / model locations.
# ---------------------------------------------------------------------------
flags.DEFINE_string(
    name="data_dir",
    default=None,
    help="The input data dir. Should contain the .conll files (or other data files) for the task.",
)

flags.DEFINE_string(
    name="model_type",
    default=None,
    help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
)

flags.DEFINE_string(
    name="model_name_or_path",
    default=None,
    help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)

flags.DEFINE_string(
    name="output_dir",
    default=None,
    help="The output directory where the model checkpoints will be written.",
)

flags.DEFINE_string(
    name="labels",
    default="",
    help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
)

flags.DEFINE_string(
    name="config_name",
    default="",
    help="Pretrained config name or path if not the same as model_name",
)

flags.DEFINE_string(
    name="tokenizer_name",
    default="",
    help="Pretrained tokenizer name or path if not the same as model_name",
)

flags.DEFINE_string(
    name="cache_dir",
    default="",
    help="Where do you want to store the pre-trained models downloaded from s3",
)

flags.DEFINE_integer(
    name="max_seq_length",
    default=128,
    help="The maximum total input sentence length after tokenization. "
         "Sequences longer than this will be truncated, sequences shorter "
         "will be padded.",
)

# ---------------------------------------------------------------------------
# Hardware selection.
# ---------------------------------------------------------------------------
flags.DEFINE_string(
    name="tpu",
    default=None,
    help="The Cloud TPU to use for training. This should be either the name "
         "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
         "url.",
)

flags.DEFINE_integer(
    name="num_tpu_cores",
    default=8,
    help="Total number of TPU cores to use.",
)

# ---------------------------------------------------------------------------
# Which phases to run.
# ---------------------------------------------------------------------------
flags.DEFINE_boolean(
    name="do_train",
    default=False,
    help="Whether to run training.",
)

flags.DEFINE_boolean(
    name="do_eval",
    default=False,
    help="Whether to run eval on the dev set.",
)

flags.DEFINE_boolean(
    name="do_predict",
    default=False,
    help="Whether to run predictions on the test set.",
)

flags.DEFINE_boolean(
    name="evaluate_during_training",
    default=False,
    help="Whether to run evaluation during training at each logging step.",
)

flags.DEFINE_boolean(
    name="do_lower_case",
    default=False,
    help="Set this flag if you are using an uncased model.",
)

# ---------------------------------------------------------------------------
# Optimization hyper-parameters.
# ---------------------------------------------------------------------------
flags.DEFINE_integer(
    name="per_device_train_batch_size",
    default=8,
    help="Batch size per GPU/CPU/TPU for training.",
)

flags.DEFINE_integer(
    name="per_device_eval_batch_size",
    default=8,
    help="Batch size per GPU/CPU/TPU for evaluation.",
)

flags.DEFINE_integer(
    name="gradient_accumulation_steps",
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)

flags.DEFINE_float(
    name="learning_rate",
    default=5e-5,
    help="The initial learning rate for Adam.",
)

flags.DEFINE_float(
    name="weight_decay",
    default=0.0,
    help="Weight decay if we apply some.",
)

flags.DEFINE_float(
    name="adam_epsilon",
    default=1e-8,
    help="Epsilon for Adam optimizer.",
)

flags.DEFINE_float(
    name="max_grad_norm",
    default=1.0,
    help="Max gradient norm.",
)

flags.DEFINE_integer(
    name="num_train_epochs",
    default=3,
    help="Total number of training epochs to perform.",
)

flags.DEFINE_integer(
    name="max_steps",
    default=-1,
    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)

flags.DEFINE_integer(
    name="warmup_steps",
    default=0,
    help="Linear warmup over warmup_steps.",
)

# ---------------------------------------------------------------------------
# Logging, checkpointing and miscellaneous switches.
# ---------------------------------------------------------------------------
flags.DEFINE_integer(
    name="logging_steps",
    default=50,
    help="Log every X updates steps.",
)

flags.DEFINE_integer(
    name="save_steps",
    default=50,
    help="Save checkpoint every X updates steps.",
)

flags.DEFINE_boolean(
    name="eval_all_checkpoints",
    default=False,
    help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)

flags.DEFINE_boolean(
    name="no_cuda",
    default=False,
    help="Avoid using CUDA when available",
)

flags.DEFINE_boolean(
    name="overwrite_output_dir",
    default=False,
    help="Overwrite the content of the output directory",
)

flags.DEFINE_boolean(
    name="overwrite_cache",
    default=False,
    help="Overwrite the cached training and evaluation sets",
)

flags.DEFINE_integer(
    name="seed",
    default=42,
    help="random seed for initialization",
)

flags.DEFINE_boolean(
    name="fp16",
    default=False,
    help="Whether to use 16-bit (mixed) precision instead of 32-bit",
)

flags.DEFINE_string(
    name="gpus",
    default="0",
    help="Comma separated list of gpus devices. If only one, switch to single "
         "gpu strategy, if None takes all the gpus available.",
)
|
||||
|
||||
|
||||
def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id):
    """Run the training loop for ``model`` under the given distribution strategy.

    Gradients of every batch are fed to a ``GradientAccumulator`` and applied
    once every ``args['gradient_accumulation_steps']`` batches. At the
    configured intervals the function writes TensorBoard summaries, optionally
    evaluates on the dev set (single-device runs only) and saves a checkpoint
    to ``<output_dir>/checkpoint-<global_step>``.

    Args:
        args: dict of flag values; ``args['num_train_epochs']`` is overwritten
            with 1 when ``args['max_steps'] > 0``.
        strategy: ``tf.distribute`` strategy used to run the steps.
        train_dataset: iterable yielding ``(features, labels)`` batches.
        tokenizer: forwarded to ``evaluate`` when evaluating during training.
        model: model to optimize; called with ``input_ids`` plus keyword inputs.
        num_train_examples: number of training examples (used for step count).
        labels: list of label names; logits are reshaped to ``len(labels) + 1``.
        train_batch_size: global batch size used to scale the loss.
        pad_token_label_id: forwarded to ``evaluate``.

    Returns:
        None.
    """
    accumulation_steps = args['gradient_accumulation_steps']

    if args['max_steps'] > 0:
        num_train_steps = args['max_steps'] * accumulation_steps
        args['num_train_epochs'] = 1
    else:
        steps_per_epoch = math.ceil(num_train_examples / train_batch_size) // accumulation_steps
        num_train_steps = steps_per_epoch * args['num_train_epochs']

    # NOTE(review): the summary directory is hard-coded rather than derived from output_dir.
    writer = tf.summary.create_file_writer("/tmp/mylogs")

    with strategy.scope():
        # Reduction.NONE keeps one loss value per active token.
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
        optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps'])

        if args['fp16']:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')

        loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()

    logging.info("***** Running training *****")
    logging.info(" Num examples = %d", num_train_examples)
    logging.info(" Num Epochs = %d", args['num_train_epochs'])
    logging.info(" Instantaneous batch size per device = %d", args['per_device_train_batch_size'])
    logging.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                 train_batch_size * accumulation_steps)
    logging.info(" Gradient Accumulation steps = %d", accumulation_steps)
    logging.info(" Total training steps = %d", num_train_steps)

    model.summary()

    @tf.function
    def apply_gradients():
        # Average the accumulated gradients over devices and accumulation steps;
        # variables without a gradient keep their ``None`` entry.
        divisor = args['n_device'] * args['gradient_accumulation_steps']
        grads_and_vars = [
            (grad / divisor if grad is not None else grad, var)
            for grad, var in zip(gradient_accumulator.gradients, model.trainable_variables)
        ]

        optimizer.apply_gradients(grads_and_vars, args['max_grad_norm'])
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        def step_fn(features, label_ids):
            model_kwargs = {'attention_mask': features['input_mask'], 'training': True}

            if args['model_type'] != "distilbert":
                if args['model_type'] in ["bert", "xlnet"]:
                    model_kwargs["token_type_ids"] = features['segment_ids']
                else:
                    model_kwargs["token_type_ids"] = None

            with tf.GradientTape() as tape:
                logits = model(features['input_ids'], **model_kwargs)[0]
                logits = tf.reshape(logits, (-1, len(labels) + 1))
                # Only positions selected by the input mask contribute to the loss.
                active_mask = tf.reshape(features['input_mask'], (-1,))
                active_logits = tf.boolean_mask(logits, active_mask)
                flat_labels = tf.reshape(label_ids, (-1,))
                active_labels = tf.boolean_mask(flat_labels, active_mask)
                cross_entropy = loss_fct(active_labels, active_logits)
                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
                grads = tape.gradient(loss, model.trainable_variables)

                gradient_accumulator(grads)

            return cross_entropy

        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))

        return strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)

    started_at = datetime.datetime.now()
    train_iterator = master_bar(range(args['num_train_epochs']))
    global_step = 0
    logging_loss = 0.0

    for epoch in train_iterator:
        epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator,
                                      display=args['n_device'] > 1)
        step = 1

        with strategy.scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)

                if step % args['gradient_accumulation_steps'] == 0:
                    strategy.experimental_run_v2(apply_gradients)
                    loss_metric(loss)
                    global_step += 1

                    if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        if args['n_device'] == 1 and args['evaluate_during_training']:
                            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels,
                                                                 pad_token_label_id, mode="dev")
                            report = metrics.classification_report(y_true, y_pred, digits=4)

                            logging.info("Eval at step " + str(global_step) + "\n" + report)
                            logging.info("eval_loss: " + str(eval_loss))

                            precision = metrics.precision_score(y_true, y_pred)
                            recall = metrics.recall_score(y_true, y_pred)
                            f1 = metrics.f1_score(y_true, y_pred)

                            with writer.as_default():
                                tf.summary.scalar("eval_loss", eval_loss, global_step)
                                tf.summary.scalar("precision", precision, global_step)
                                tf.summary.scalar("recall", recall, global_step)
                                tf.summary.scalar("f1", f1, global_step)

                        # NOTE(review): the schedule is queried with the per-epoch
                        # ``step`` counter, not ``global_step`` - confirm this is intended.
                        lr_schedule = optimizer.learning_rate
                        learning_rate = lr_schedule(step)

                        with writer.as_default():
                            tf.summary.scalar("lr", learning_rate, global_step)
                            tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'],
                                              global_step)

                        logging_loss = loss_metric.result()

                    with writer.as_default():
                        tf.summary.scalar("loss", loss_metric.result(), step=step)

                    if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                        # Save model checkpoint
                        checkpoint_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step))

                        if not os.path.exists(checkpoint_dir):
                            os.makedirs(checkpoint_dir)

                        model.save_pretrained(checkpoint_dir)
                        logging.info("Saving model checkpoint to %s", checkpoint_dir)

                train_iterator.child.comment = f'loss : {loss_metric.result()}'
                step += 1

        train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}')

        loss_metric.reset_states()

    logging.info(" Training took time = {}".format(datetime.datetime.now() - started_at))
|
||||
|
||||
|
||||
def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
    """Run ``model`` over the ``mode`` split and collect labels, predictions and loss.

    Args:
        args: dict of flag values (reads ``per_device_eval_batch_size``,
            ``n_device`` and ``model_type``).
        strategy: ``tf.distribute`` strategy used to distribute the dataset
            and scope the forward pass.
        model: model called with ``input_ids`` plus keyword inputs; element 0
            of its output is used as the logits.
        tokenizer: forwarded to ``load_and_cache_examples``.
        labels: list of label names; class index ``k`` maps to ``labels[k - 1]``.
        pad_token_label_id: label id marking positions to ignore in the output.
        mode: name of the split to load (e.g. ``"dev"`` or ``"test"``).

    Returns:
        A ``(y_true, y_pred, loss)`` tuple: two lists (one entry per example)
        of label-name lists, and the loss averaged over evaluation steps.

    Raises:
        ValueError: if the split yields no batch at all.
    """
    eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
    eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode)
    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
    num_eval_steps = math.ceil(size / eval_batch_size)
    master = master_bar(range(1))
    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1)
    # NOTE(review): from_logits is left at its default here, while main() only
    # sets a softmax activation on the freshly trained model - confirm that
    # checkpoints reloaded for evaluation output probabilities as well.
    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0.0

    # Collect per-batch arrays and concatenate once at the end; appending to a
    # growing array inside the loop would copy all previous batches every time.
    logits_batches = []
    label_batches = []

    logging.info("***** Running evaluation *****")
    logging.info(" Num examples = %d", size)
    logging.info(" Batch size = %d", eval_batch_size)

    for eval_features, eval_labels in eval_iterator:
        inputs = {'attention_mask': eval_features['input_mask'], 'training': False}

        if args['model_type'] != "distilbert":
            inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None

        with strategy.scope():
            logits = model(eval_features['input_ids'], **inputs)[0]
            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
            # Only positions selected by the input mask contribute to the loss.
            active_loss = tf.reshape(eval_features['input_mask'], (-1,))
            active_logits = tf.boolean_mask(tmp_logits, active_loss)
            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
            cross_entropy = loss_fct(active_labels, active_logits)
            loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)

        logits_batches.append(logits.numpy())
        label_batches.append(eval_labels.numpy())

    if not logits_batches:
        # Fail with an explicit message instead of an opaque error further down.
        raise ValueError("No evaluation batch was produced for mode '{}'.".format(mode))

    preds = np.argmax(np.concatenate(logits_batches, axis=0), axis=2)
    label_ids = np.concatenate(label_batches, axis=0)
    loss = loss / num_eval_steps

    y_true = []
    y_pred = []

    for gold_row, pred_row in zip(label_ids, preds):
        keep = gold_row != pad_token_label_id
        # Class ids are shifted by one relative to ``labels``.
        y_true.append([labels[label_id - 1] for label_id in gold_row[keep]])
        y_pred.append([labels[label_id - 1] for label_id in pred_row[keep]])

    return y_true, y_pred, loss.numpy()
|
||||
|
||||
|
||||
def load_cache(cached_file, max_seq_length):
    """Read a TFRecord feature cache and count its records.

    Args:
        cached_file: path of the TFRecord file to read.
        max_seq_length: fixed length of every int64 feature vector in a record.

    Returns:
        A ``(dataset, count)`` tuple where ``dataset`` yields
        ``(features_dict, label_ids)`` pairs and ``count`` is the number of
        records obtained by reducing over the dataset.
    """
    fixed_int_vector = tf.io.FixedLenFeature([max_seq_length], tf.int64)
    name_to_features = {
        "input_ids": fixed_int_vector,
        "input_mask": fixed_int_vector,
        "segment_ids": fixed_int_vector,
        "label_ids": fixed_int_vector,
    }

    def _decode_record(record):
        # Split the parsed example into model inputs and the label vector.
        parsed = tf.io.parse_single_example(record, name_to_features)
        model_inputs = {key: parsed[key] for key in ("input_ids", "input_mask", "segment_ids")}

        return model_inputs, parsed['label_ids']

    dataset = tf.data.TFRecordDataset(cached_file).map(_decode_record, num_parallel_calls=4)
    num_records = dataset.reduce(0, lambda running_total, _: running_total + 1)

    return dataset, num_records.numpy()
|
||||
|
||||
|
||||
def save_cache(features, cached_features_file):
    """Serialize ``features`` into a TFRecord file.

    Each feature object must expose ``input_ids``, ``input_mask``,
    ``segment_ids`` and ``label_ids`` iterables of ints; they are stored as
    int64 lists under those same keys.

    Args:
        features: sized sequence of feature objects to write.
        cached_features_file: destination path of the TFRecord file.
    """

    def create_int_feature(values):
        # Wrap an iterable of ints into a tf.train.Feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    total = len(features)

    # The context manager closes the writer even if serialization fails midway.
    with tf.io.TFRecordWriter(cached_features_file) as writer:
        for ex_index, feature in enumerate(features):
            if ex_index % 5000 == 0:
                logging.info("Writing example %d of %d", ex_index, total)

            record_feature = collections.OrderedDict()
            record_feature["input_ids"] = create_int_feature(feature.input_ids)
            record_feature["input_mask"] = create_int_feature(feature.input_mask)
            record_feature["segment_ids"] = create_int_feature(feature.segment_ids)
            record_feature["label_ids"] = create_int_feature(feature.label_ids)

            tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature))

            writer.write(tf_example.SerializeToString())
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
    """Build the batched ``tf.data`` pipeline for one data split.

    Features are read from a TFRecord cache inside ``args['data_dir']``; the
    cache is (re)built from the raw examples when it is missing or when
    ``args['overwrite_cache']`` is set.

    Args:
        args: dict of flag values.
        tokenizer: tokenizer used to convert examples to features.
        labels: list of label names.
        pad_token_label_id: label id assigned to padded positions.
        batch_size: global batch size of the returned dataset.
        mode: split name; ``'train'`` additionally repeats and shuffles.

    Returns:
        A ``(dataset, size)`` tuple with the batched, prefetched dataset and
        the number of records found in the cache file.
    """
    is_train = mode == 'train'
    drop_remainder = bool(args['tpu'] or is_train)
    max_seq_length = args['max_seq_length']

    # Load data features from cache or dataset file
    model_basename = [part for part in args['model_name_or_path'].split("/") if part][-1]
    cache_name = "cached_{}_{}_{}.tf_record".format(mode, model_basename, str(max_seq_length))
    cached_features_file = os.path.join(args['data_dir'], cache_name)

    if os.path.exists(cached_features_file) and not args['overwrite_cache']:
        logging.info("Loading features from cached file %s", cached_features_file)
    else:
        logging.info("Creating features from dataset file at %s", args['data_dir'])
        is_xlnet = args['model_type'] in ["xlnet"]
        examples = read_examples_from_file(args['data_dir'], mode)
        features = convert_examples_to_features(
            examples,
            labels,
            max_seq_length,
            tokenizer,
            # xlnet has a cls token at the end
            cls_token_at_end=bool(is_xlnet),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args['model_type'] in ["roberta"]),
            # pad on the left for xlnet
            pad_on_left=bool(is_xlnet),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if is_xlnet else 0,
            pad_token_label_id=pad_token_label_id,
        )
        logging.info("Saving features into cached file %s", cached_features_file)
        save_cache(features, cached_features_file)

    # In both cases the dataset is read back from the cache file.
    dataset, size = load_cache(cached_features_file, max_seq_length)

    if is_train:
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=8192, seed=args['seed'])

    dataset = dataset.batch(batch_size, drop_remainder)
    dataset = dataset.prefetch(buffer_size=batch_size)

    return dataset, size
|
||||
|
||||
|
||||
def main(_):
    """Train, evaluate and/or predict according to the command-line flags.

    Selects a ``tf.distribute`` strategy (TPU, multi-GPU mirrored, CPU or
    single GPU), then runs the phases enabled by ``--do_train``, ``--do_eval``
    and ``--do_predict``. Evaluation results are written to
    ``<output_dir>/eval_results.txt``; test results and per-token predictions
    go to ``test_results.txt`` and ``test_predictions.txt``.

    Args:
        _: positional arguments forwarded by ``absl.app.run`` (unused).

    Raises:
        ValueError: if training is requested into a non-empty output directory
            without ``--overwrite_output_dir``.
    """
    logging.set_verbosity(logging.INFO)
    args = flags.FLAGS.flag_values_dict()

    if os.path.exists(args['output_dir']) and os.listdir(
            args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args['output_dir']))

    if args['fp16']:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    # Pick the distribution strategy and record the device count in args.
    if args['tpu']:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu'])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        args['n_device'] = args['num_tpu_cores']
    elif len(args['gpus'].split(',')) > 1:
        devices = [f"/gpu:{gpu}" for gpu in args['gpus'].split(',')]
        args['n_device'] = len(devices)
        strategy = tf.distribute.MirroredStrategy(devices=devices)
    elif args['no_cuda']:
        args['n_device'] = 1
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    else:
        args['n_device'] = len(args['gpus'].split(','))
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0])

    logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s",
                    args['n_device'], bool(args['n_device'] > 1), args['fp16'])

    labels = get_labels(args['labels'])
    # One extra class: id 0 is reserved for padded positions.
    num_labels = len(labels) + 1
    pad_token_label_id = 0
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
    cache_dir = args['cache_dir'] if args['cache_dir'] else None
    config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
                                          num_labels=num_labels,
                                          cache_dir=cache_dir)

    logging.info("Training/evaluation parameters %s", args)

    # Training
    if args['do_train']:
        tokenizer = tokenizer_class.from_pretrained(
            args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
            do_lower_case=args['do_lower_case'],
            cache_dir=cache_dir)

        with strategy.scope():
            model = model_class.from_pretrained(args['model_name_or_path'],
                                                from_pt=bool(".bin" in args['model_name_or_path']),
                                                config=config,
                                                cache_dir=cache_dir)
            model.layers[-1].activation = tf.keras.activations.softmax

        train_batch_size = args['per_device_train_batch_size'] * args['n_device']
        train_dataset, num_train_examples = load_and_cache_examples(
            args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train")
        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
        train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size,
              pad_token_label_id)

        if not os.path.exists(args['output_dir']):
            os.makedirs(args['output_dir'])

        logging.info("Saving model to %s", args['output_dir'])

        model.save_pretrained(args['output_dir'])
        tokenizer.save_pretrained(args['output_dir'])

    # Evaluation
    if args['do_eval']:
        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
        checkpoints = []
        results = []

        if args['eval_all_checkpoints']:
            # Order checkpoint directories by the digits contained in their path.
            weight_files = sorted(
                glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
                key=lambda f: int(''.join(filter(str.isdigit, f)) or -1))
            checkpoints = [os.path.dirname(c) for c in weight_files]

        logging.info("Evaluate the following checkpoints: %s", checkpoints)

        if len(checkpoints) == 0:
            checkpoints.append(args['output_dir'])

        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"

            with strategy.scope():
                model = model_class.from_pretrained(checkpoint)

            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id,
                                                 mode="dev")
            report = metrics.classification_report(y_true, y_pred, digits=4)

            # global_step is always a non-empty string, so every checkpoint is recorded.
            results.append({global_step + "_report": report, global_step + "_loss": eval_loss})

        output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")

        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            for res in results:
                for key, val in res.items():
                    if "loss" in key:
                        logging.info(key + " = " + str(val))
                        writer.write(key + " = " + str(val))
                        writer.write("\n")
                    else:
                        # Write this entry's own report (``val``); using the loop
                        # variable ``report`` would repeat the last checkpoint's
                        # report for every checkpoint.
                        logging.info(key)
                        logging.info("\n" + val)
                        writer.write(key + "\n")
                        writer.write(val)
                        writer.write("\n")

    if args['do_predict']:
        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])

        # Load under the strategy scope, consistent with the evaluation path.
        with strategy.scope():
            model = model_class.from_pretrained(args['output_dir'])

        # evaluate() loads (and caches) the test split itself.
        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id,
                                             mode="test")
        output_test_results_file = os.path.join(args['output_dir'], "test_results.txt")
        output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
        report = metrics.classification_report(y_true, y_pred, digits=4)

        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
            logging.info("\n" + report)

            writer.write(report)
            writer.write("\n\nloss = " + str(pred_loss))

        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
            with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f:
                example_id = 0

                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)

                        # A separator after a fully consumed example moves on to the next one.
                        if not y_pred[example_id]:
                            example_id += 1
                    elif y_pred[example_id]:
                        output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # These flags have no usable default and must be given on the command line.
    for required_flag in ("data_dir", "output_dir", "model_name_or_path", "model_type"):
        flags.mark_flag_as_required(required_flag)

    app.run(main)
|
||||
61
examples/summarization/README.md
Normal file
61
examples/summarization/README.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# Text Summarization with Pretrained Encoders
|
||||
|
||||
This folder contains part of the code necessary to reproduce the results on abstractive summarization from the article [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) by [Yang Liu](https://nlp-yang.github.io/) and [Mirella Lapata](https://homepages.inf.ed.ac.uk/mlap/). It can also be used to summarize any document.
|
||||
|
||||
The original code can be found in Yang Liu's [GitHub repository](https://github.com/nlpyang/PreSumm).
|
||||
|
||||
The model is loaded with the pre-trained weights for the abstractive summarization model trained on the CNN/Daily Mail dataset, first on an extractive and then on an abstractive task.
|
||||
|
||||
## Setup
|
||||
|
||||
```
|
||||
git clone https://github.com/huggingface/transformers && cd transformers
|
||||
pip install [--editable] .
|
||||
pip install nltk py-rouge
|
||||
cd examples/summarization
|
||||
```
|
||||
|
||||
## Reproduce the authors' results on ROUGE
|
||||
|
||||
To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
|
||||
|
||||
```bash
|
||||
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
|
||||
```
|
||||
|
||||
Then move all the stories to the same folder. We will refer to the path where you uncompressed both archives as `$DATA_PATH`. Then run the following in the same folder as `run_summarization.py`:
|
||||
|
||||
```bash
|
||||
python run_summarization.py \
|
||||
--documents_dir $DATA_PATH \
|
||||
--summaries_output_dir $SUMMARIES_PATH \ # optional
|
||||
--no_cuda false \
|
||||
--batch_size 4 \
|
||||
--min_length 50 \
|
||||
--max_length 200 \
|
||||
--beam_size 5 \
|
||||
--alpha 0.95 \
|
||||
--block_trigram true \
|
||||
--compute_rouge true
|
||||
```
|
||||
|
||||
The script executes on a GPU if one is available and `no_cuda` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written to a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
|
||||
|
||||
## Summarize any text
|
||||
|
||||
Put the documents that you would like to summarize in a folder (the path to which is referred to as `$DATA_PATH` below) and run the following in the same folder as `run_summarization.py`:
|
||||
|
||||
```bash
|
||||
python run_summarization.py \
|
||||
--documents_dir $DATA_PATH \
|
||||
--summaries_output_dir $SUMMARIES_PATH \ # optional
|
||||
--no_cuda false \
|
||||
--batch_size 4 \
|
||||
--min_length 50 \
|
||||
--max_length 200 \
|
||||
--beam_size 5 \
|
||||
--alpha 0.95 \
|
||||
--block_trigram true \
|
||||
```
|
||||
|
||||
You may want to play around with `min_length`, `max_length` and `alpha` to suit your use case. If you want to compute ROUGE on another dataset you will need to tweak the stories/summaries import in `utils_summarization.py` and tell it where to fetch the reference summaries.
|
||||
119
examples/summarization/configuration_bertabs.py
Normal file
119
examples/summarization/configuration_bertabs.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019 The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" BertAbs configuration """
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
BERTABS_FINETUNED_CONFIG_MAP = {
|
||||
"bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-config.json",
|
||||
}
|
||||
|
||||
|
||||
class BertAbsConfig(PretrainedConfig):
    r""" Class to store the configuration of the BertAbs model.

    Arguments:
        vocab_size_or_config_json_file: int or str
            Either the vocabulary size, or the path to a JSON file holding the
            configuration. When a path is given, every key of the JSON file is
            copied onto the instance and the keyword arguments below are not
            applied.
        max_pos: int
            The maximum sequence length that this model will be used with.
        enc_layers: int
            The number of hidden layers in the Transformer encoder.
        enc_hidden_size: int
            The size of the encoder's layers.
        enc_heads: int
            The number of attention heads for each attention layer in the encoder.
        enc_ff_size: int
            The size of the encoder's feed-forward layers.
        enc_dropout: float
            The dropout probability for all fully connected layers in the
            embeddings, layers, pooler and also the attention probabilities in
            the encoder.
        dec_layers: int
            The number of hidden layers in the decoder.
        dec_hidden_size: int
            The size of the decoder's layers.
        dec_heads: int
            The number of attention heads for each attention layer in the decoder.
        dec_ff_size: int
            The size of the decoder's feed-forward layers.
        dec_dropout: float
            The dropout probability for all fully connected layers in the
            embeddings, layers, pooler and also the attention probabilities in
            the decoder.

    Raises:
        ValueError: if the first argument is neither an int nor a str.
    """

    pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP

    def __init__(
        self,
        vocab_size_or_config_json_file=30522,
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
        enc_heads=8,
        enc_ff_size=512,
        enc_dropout=0.2,
        dec_layers=6,
        dec_hidden_size=768,
        dec_heads=8,
        dec_ff_size=2048,
        dec_dropout=0.2,
        **kwargs,
    ):
        super(BertAbsConfig, self).__init__(**kwargs)

        if self._input_is_path_to_json(vocab_size_or_config_json_file):
            # Configuration stored on disk: copy every entry onto the instance.
            path_to_json = vocab_size_or_config_json_file
            with open(path_to_json, "r", encoding="utf-8") as reader:
                json_config = json.load(reader)
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.max_pos = max_pos

            self.enc_layers = enc_layers
            self.enc_hidden_size = enc_hidden_size
            self.enc_heads = enc_heads
            self.enc_ff_size = enc_ff_size
            self.enc_dropout = enc_dropout

            self.dec_layers = dec_layers
            self.dec_hidden_size = dec_hidden_size
            self.dec_heads = dec_heads
            self.dec_ff_size = dec_ff_size
            self.dec_dropout = dec_dropout
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int) "
                "or the path to a pretrained model config file (str)"
            )

    def _input_is_path_to_json(self, first_argument):
        """ Checks whether the first argument passed to config
        is the path to a JSON file that contains the config.

        Any ``str`` is treated as a path. The trailing comma after ``**kwargs``
        in ``__init__`` already makes this module Python 3 only, so no
        ``unicode`` check is needed.
        """
        return isinstance(first_argument, str)
|
||||
@@ -0,0 +1,163 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Convert BertExtAbs's checkpoints.
|
||||
|
||||
The script looks like it is doing something trivial but it is not. The "weights"
|
||||
proposed by the authors are actually the entire model pickled. We need to load
|
||||
the model within the original codebase to be able to only save its `state_dict`.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from collections import namedtuple
|
||||
import logging
|
||||
import torch
|
||||
|
||||
from models.model_builder import AbsSummarizer # The authors' implementation
|
||||
from model_bertabs import BertAbsSummarizer
|
||||
|
||||
from transformers import BertTokenizer
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
SAMPLE_TEXT = 'Hello world! cécé herlolip'
|
||||
|
||||
|
||||
# Lightweight, immutable configuration record. An instance is passed to both
# the authors' `AbsSummarizer` and `BertAbsSummarizer` in
# `convert_bertabs_checkpoints`.
BertAbsConfig = namedtuple(
    "BertAbsConfig",
    ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
)
|
||||
|
||||
|
||||
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    """ Copy/paste and tweak the pre-trained weights provided by the creators
    of BertAbs for the internal architecture.

    The authors' checkpoint is loaded into their own `AbsSummarizer`, its
    sub-modules' state dicts are copied into a `BertAbsSummarizer`, both
    models are run on the same padded sample input, and the converted
    model's `state_dict` is saved once the outputs agree.

    Arguments:
        path_to_checkpoints: string
            Path to the checkpoint file released by the authors.
        dump_path: string
            Folder in which the converted `state_dict` is written (created if
            missing). When empty or None the file is written to the current
            working directory.

    Raises:
        ValueError: if the outputs of the two models differ by more than 1e-3.
    """
    import os  # local import: only needed to build the output location

    # Instantiate the authors' model with the pre-trained weights
    config = BertAbsConfig(
        temp_dir=".",
        finetune_bert=False,
        large=False,
        share_emb=True,
        use_bert_emb=False,
        encoder="bert",
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
        enc_heads=8,
        enc_ff_size=512,
        enc_dropout=0.2,
        dec_layers=6,
        dec_hidden_size=768,
        dec_heads=8,
        dec_ff_size=2048,
        dec_dropout=0.2,
    )
    # The map_location callable keeps every storage where it was deserialized
    # (CPU), so no GPU is needed for the conversion.
    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
    original.eval()

    new_model = BertAbsSummarizer(config, torch.device("cpu"))
    new_model.eval()

    # -------------------
    # Convert the weights
    # -------------------

    logger.info("convert the model")
    new_model.bert.load_state_dict(original.bert.state_dict())
    new_model.decoder.load_state_dict(original.decoder.state_dict())
    new_model.generator.load_state_dict(original.generator.state_dict())

    # -----------------------------------
    # Make sure the outputs are identical
    # -----------------------------------

    logger.info("Make sure that the models' outputs are identical")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # prepare the model inputs, right-padded to the maximum sequence length
    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
    encoder_input_ids.extend([tokenizer.pad_token_id] * (config.max_pos - len(encoder_input_ids)))
    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
    decoder_input_ids.extend([tokenizer.pad_token_id] * (config.max_pos - len(decoder_input_ids)))
    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)

    # failsafe to make sure the weights reset does not affect the
    # loaded weights.
    assert torch.max(torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0

    # forward pass: the same tensors are given under the names each
    # implementation expects
    src = encoder_input_ids
    tgt = decoder_input_ids
    segs = token_type_ids = None
    clss = None
    mask_src = encoder_attention_mask = None
    mask_tgt = decoder_attention_mask = None
    mask_cls = None

    # The original model does not apply the generator layer immediately but rather in
    # the beam search (where it combines softmax + linear layer). Since we already
    # apply the softmax in our generation process we only apply the linear layer here.
    # We make sure that the outputs of the full stack are identical
    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
    output_original_generator = original.generator(output_original_model)

    output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0]
    output_converted_generator = new_model.generator(output_converted_model)

    # Scientific notation: a fixed two-decimal format would print 0.00 for
    # any difference below 5e-3.
    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
    logger.info("Maximum absolute difference between model outputs: %.2e", maximum_absolute_difference)
    maximum_absolute_difference = torch.max(torch.abs(output_converted_generator - output_original_generator)).item()
    logger.info("Maximum absolute difference between generator outputs: %.2e", maximum_absolute_difference)

    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
    if are_identical:
        logger.info("all outputs are equal up to 1e-3")
    else:
        raise ValueError("the outputs are different. The new model is likely different from the original one.")

    # The model has been saved with torch.save(model) and this is bound to the exact
    # directory structure. We save the state_dict instead.
    weights_filename = "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"
    if dump_path:
        os.makedirs(dump_path, exist_ok=True)
        output_file = os.path.join(dump_path, weights_filename)
    else:
        output_file = weights_filename
    logger.info("saving the model's state dictionary to %s", output_file)
    torch.save(new_model.state_dict(), output_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: both options are mandatory string paths, so
    # they are declared from a single table.
    cli_parser = argparse.ArgumentParser()
    required_paths = (
        ("--bertabs_checkpoint_path", "Path the official PyTorch dump."),
        ("--pytorch_dump_folder_path", "Path to the output PyTorch model."),
    )
    for option_name, help_text in required_paths:
        cli_parser.add_argument(option_name, default=None, type=str, required=True, help=help_text)
    args = cli_parser.parse_args()

    convert_bertabs_checkpoints(args.bertabs_checkpoint_path, args.pytorch_dump_folder_path)
|
||||
1161
examples/summarization/modeling_bertabs.py
Normal file
1161
examples/summarization/modeling_bertabs.py
Normal file
File diff suppressed because it is too large
Load Diff
9
examples/summarization/requirements.txt
Normal file
9
examples/summarization/requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
# progress bars in model download and training scripts
|
||||
tqdm
|
||||
# Accessing files from S3 directly.
|
||||
boto3
|
||||
# Used for downloading models over HTTP
|
||||
requests
|
||||
# For ROUGE
|
||||
nltk
|
||||
py-rouge
|
||||
344
examples/summarization/run_summarization.py
Normal file
344
examples/summarization/run_summarization.py
Normal file
@@ -0,0 +1,344 @@
|
||||
#! /usr/bin/python3
|
||||
import argparse
|
||||
from collections import namedtuple
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, SequentialSampler
|
||||
from tqdm import tqdm
|
||||
|
||||
from transformers import BertTokenizer
|
||||
|
||||
from modeling_bertabs import BertAbs, build_predictor
|
||||
|
||||
from utils_summarization import (
|
||||
SummarizationDataset,
|
||||
encode_for_summarization,
|
||||
build_mask,
|
||||
fit_to_block_size,
|
||||
compute_token_type_ids,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
|
||||
|
||||
# Container for one collated batch; `collate` fills it with the document
# names, the number of documents, the encoder tensors (`src`, `segs`,
# `mask_src`) and the reference summaries as strings (`tgt_str`).
Batch = namedtuple(
    "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]
)
|
||||
|
||||
|
||||
def evaluate(args):
    """ Summarize every document of `args.documents_dir` with BertAbs.

    The summaries are generated batch by batch with beam search and written
    to `args.summaries_output_dir`. When `args.compute_rouge` is set, the
    generated summaries are also scored against the reference summaries; the
    scores are printed and saved with `save_rouge_scores`.

    Arguments:
        args: argparse.Namespace
            The parsed command-line options (see `main`), with `device`
            already set.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
    model.to(args.device)
    model.eval()

    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        # Imported lazily so the ROUGE dependencies are only required when
        # the metric is requested.
        import rouge
        import nltk

        nltk.download('punkt')
        rouge_evaluator = rouge.Rouge(
            metrics=['rouge-n', 'rouge-l'],
            max_n=2,
            limit_length=True,
            # Cap at the maximum summary length. The beam size is a search
            # width, not a length: using it here (default 5) would score only
            # the first few words of each summary.
            length_limit=args.max_length,
            length_limit_type='words',
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )

    # these (unused) arguments are defined to keep the compatibility
    # with the legacy code and will be deleted in a next iteration.
    args.result_path = ""
    args.temp_dir = ""

    data_iterator = build_data_iterator(args, tokenizer)
    predictor = build_predictor(args, tokenizer, symbols, model)

    logger.info("***** Running evaluation *****")
    logger.info(" Number examples = %d", len(data_iterator.dataset))
    logger.info(" Batch size = %d", args.batch_size)
    logger.info("")
    logger.info("***** Beam Search parameters *****")
    logger.info(" Beam size = %d", args.beam_size)
    logger.info(" Minimum length = %d", args.min_length)
    logger.info(" Maximum length = %d", args.max_length)
    logger.info(" Alpha (length penalty) = %.2f", args.alpha)
    logger.info(" Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))

    for batch in tqdm(data_iterator):
        batch_data = predictor.translate_batch(batch)
        translations = predictor.from_batch(batch_data)
        summaries = [format_summary(t) for t in translations]
        save_summaries(summaries, args.summaries_output_dir, batch.document_names)

        if args.compute_rouge:
            reference_summaries += batch.tgt_str
            generated_summaries += summaries

    if args.compute_rouge:
        scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
        str_scores = format_rouge_scores(scores)
        save_rouge_scores(str_scores)
        print(str_scores)
|
||||
|
||||
|
||||
def save_summaries(summaries, path, original_document_name):
    """ Write the summaries in files that are named after the original
    files, with `_summary` inserted before the extension (or appended when
    the name has no extension).

    Arguments:
        summaries: List[string]
            The summaries that we produced.
        path: string
            Path where the summaries will be written.
        original_document_name: List[string]
            Names of the documents that were summarized, in the same order
            as `summaries`.
    """
    for summary, document_name in zip(summaries, original_document_name):
        # Split on the last dot only: "a.b.txt" -> "a.b_summary.txt"
        stem, dot, extension = document_name.rpartition(".")
        if dot:
            name = stem + "_summary." + extension
        else:
            name = document_name + "_summary"

        file_path = os.path.join(path, name)
        # Explicit encoding: summaries may contain non-ASCII characters and
        # the platform default encoding is not guaranteed to handle them.
        with open(file_path, "w", encoding="utf-8") as output:
            output.write(summary)
|
||||
|
||||
|
||||
def format_summary(translation):
    """ Transforms the output of the `from_batch` function
    into nicely formatted summaries.

    Arguments:
        translation: tuple
            A 3-tuple whose first element is the raw summary string; the
            other two elements are ignored.

    Returns:
        The summary with the `[unused0]`, `[unused1]`, `[unused3]` and
        `[PAD]` markers removed, runs of spaces collapsed, and `[unused2]`
        markers turned into sentence breaks.
    """
    import re  # local import: keeps the function self-contained

    raw_summary, _, _ = translation

    summary = raw_summary
    for marker in ("[unused0]", "[unused3]", "[PAD]", "[unused1]"):
        summary = summary.replace(marker, "")

    # `str.replace` treats its pattern literally, so collapsing runs of
    # spaces requires a real regular expression.
    summary = re.sub(r" +", " ", summary)

    # `[unused2]` between two sentences becomes a period; a dangling one is dropped.
    summary = summary.replace(" [unused2] ", ". ").replace("[unused2]", "")

    return summary.strip()
|
||||
|
||||
|
||||
def format_rouge_scores(scores):
    """ Render the ROUGE-1, ROUGE-2 and ROUGE-L scores as a text report.

    Arguments:
        scores: dict
            Maps 'rouge-1', 'rouge-2' and 'rouge-l' to a dict holding the
            'f', 'p' and 'r' values.

    Returns:
        A multi-line string with each value printed with three decimals.
    """
    report_template = """\n
****** ROUGE SCORES ******

** ROUGE 1
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}

** ROUGE 2
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}

** ROUGE L
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}"""

    # Same order as the placeholders: F1, precision, recall for each metric.
    ordered_values = []
    for metric in ('rouge-1', 'rouge-2', 'rouge-l'):
        for statistic in ('f', 'p', 'r'):
            ordered_values.append(scores[metric][statistic])

    return report_template.format(*ordered_values)
|
||||
|
||||
|
||||
def save_rouge_scores(str_scores):
    """ Write the formatted ROUGE report to `rouge_scores.txt` in the current
    working directory, replacing any previous content.
    """
    with open("rouge_scores.txt", "w") as report_file:
        report_file.write(str_scores)
|
||||
|
||||
|
||||
#
|
||||
# LOAD the dataset
|
||||
#
|
||||
|
||||
|
||||
def build_data_iterator(args, tokenizer):
    """ Build the `DataLoader` that yields the documents in order, one
    collated batch of `args.batch_size` documents at a time.
    """

    def collate_fn(data):
        # Bind the tokenizer, block size and target device for the loader.
        return collate(data, tokenizer, block_size=512, device=args.device)

    documents = load_and_cache_examples(args, tokenizer)
    return DataLoader(
        documents,
        sampler=SequentialSampler(documents),
        batch_size=args.batch_size,
        collate_fn=collate_fn,
    )
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer):
    """ Return the dataset built from the documents in `args.documents_dir`.

    `tokenizer` is accepted but not used here.
    """
    return SummarizationDataset(args.documents_dir)
|
||||
|
||||
|
||||
def collate(data, tokenizer, block_size, device):
    """ Turn a list of dataset items into a `Batch` for the data loader.

    Documents are tokenized here, batch after batch, so they are never all
    held in memory at once. The result is a namedtuple to fit the original
    BertAbs's API.

    Arguments:
        data: list of (name, story_lines, summary_lines) triples.
        tokenizer: tokenizer used to encode the stories.
        block_size: length every encoded story is truncated or padded to.
        device: device the tensors are moved to.
    """
    # Drop the documents whose story is empty.
    samples = [sample for sample in data if len(sample[1]) != 0]

    document_names = []
    reference_summaries = []
    for name, _, summary_lines in samples:
        document_names.append(name)
        reference_summaries.append(" ".join(summary_lines))

    encoded_pairs = []
    for _, story_lines, summary_lines in samples:
        encoded_pairs.append(encode_for_summarization(story_lines, summary_lines, tokenizer))

    padded_stories = []
    for story_ids, _ in encoded_pairs:
        padded_stories.append(fit_to_block_size(story_ids, block_size, tokenizer.pad_token_id))
    encoded_stories = torch.tensor(padded_stories)

    segment_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
    attention_mask = build_mask(encoded_stories, tokenizer.pad_token_id)

    return Batch(
        document_names=document_names,
        batch_size=len(encoded_stories),
        src=encoded_stories.to(device),
        segs=segment_ids.to(device),
        mask_src=attention_mask.to(device),
        tgt_str=reference_summaries,
    )
|
||||
|
||||
|
||||
def decode_summary(summary_tokens, tokenizer):
    """ Decode the summary and return it in a format
    suitable for evaluation.

    Arguments:
        summary_tokens: tensor of token ids; it is moved to the CPU and
            converted to a numpy array before decoding.
        tokenizer: object whose `decode` method turns the ids into a string.

    Returns:
        The list of sentences of the decoded summary, each terminated by a
        period.
    """
    summary_tokens = summary_tokens.to("cpu").numpy()
    summary = tokenizer.decode(summary_tokens)
    # Splitting "a. b." on "." leaves an empty last fragment; skip blank
    # fragments so that no spurious "." sentence is produced.
    return [fragment + "." for fragment in summary.split(".") if fragment.strip()]
|
||||
|
||||
|
||||
def main():
|
||||
""" The main function defines the interface with the users.
|
||||
"""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--documents_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The folder where the documents to summarize are located.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--summaries_output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=False,
|
||||
help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--compute_rouge",
|
||||
default=False,
|
||||
type=bool,
|
||||
required=False,
|
||||
help="Compute the ROUGE metrics during evaluation. Only available for the CNN/DailyMail dataset.",
|
||||
)
|
||||
# EVALUATION options
|
||||
parser.add_argument(
|
||||
"--no_cuda",
|
||||
default=False,
|
||||
type=bool,
|
||||
help="Whether to force the execution on CPU.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
|
||||
)
|
||||
# BEAM SEARCH arguments
|
||||
parser.add_argument(
|
||||
"--min_length",
|
||||
default=50,
|
||||
type=int,
|
||||
help="Minimum number of tokens for the summaries.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_length",
|
||||
default=200,
|
||||
type=int,
|
||||
help="Maixmum number of tokens for the summaries.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--beam_size",
|
||||
default=5,
|
||||
type=int,
|
||||
help="The number of beams to start with for each example.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--alpha",
|
||||
default=0.95,
|
||||
type=float,
|
||||
help="The value of alpha for the length penalty in the beam search.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--block_trigram",
|
||||
default=True,
|
||||
type=bool,
|
||||
help="Whether to block the existence of repeating trigrams in the text generated by beam search.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Select device (distibuted not available)
|
||||
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
|
||||
# Check the existence of directories
|
||||
if not args.summaries_output_dir:
|
||||
args.summaries_output_dir = args.documents_dir
|
||||
|
||||
if not documents_dir_is_valid(args.documents_dir):
|
||||
raise FileNotFoundError(
|
||||
"We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
|
||||
)
|
||||
os.makedirs(args.summaries_output_dir, exist_ok=True)
|
||||
|
||||
evaluate(args)
|
||||
|
||||
|
||||
def documents_dir_is_valid(path):
    """ Tell whether `path` is an existing directory with at least one entry.

    Returns False (instead of raising) when `path` does not exist or points
    to something that is not a directory, such as a regular file.
    """
    if not os.path.isdir(path):
        return False

    return len(os.listdir(path)) > 0
|
||||
|
||||
|
||||
# Run the command-line interface only when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|
||||
@@ -10,9 +10,14 @@ from torch.utils.data import Dataset
|
||||
# ------------
|
||||
|
||||
|
||||
class CNNDailyMailDataset(Dataset):
|
||||
class SummarizationDataset(Dataset):
|
||||
""" Abstracts the dataset used to train seq2seq models.
|
||||
|
||||
The class will process the documents that are located in the specified
|
||||
folder. The preprocessing will work on any document that is reasonably
|
||||
formatted. On the CNN/DailyMail dataset it will extract both the story
|
||||
and the summary.
|
||||
|
||||
CNN/Daily News:
|
||||
|
||||
The CNN/Daily News raw datasets are downloaded from [1]. The stories are
|
||||
@@ -25,33 +30,33 @@ class CNNDailyMailDataset(Dataset):
|
||||
[2] https://github.com/abisee/cnn-dailymail/
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer, prefix="train", data_dir=""):
|
||||
assert os.path.isdir(data_dir)
|
||||
self.tokenizer = tokenizer
|
||||
def __init__(self, path="", prefix="train"):
|
||||
""" We initialize the class by listing all the documents to summarize.
|
||||
Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
|
||||
"""
|
||||
assert os.path.isdir(path)
|
||||
|
||||
# We initialize the class by listing all the files that contain
|
||||
# stories and summaries. Files are not read in memory given
|
||||
# the size of the corpus.
|
||||
self.stories_path = []
|
||||
datasets = ("cnn", "dailymail")
|
||||
for dataset in datasets:
|
||||
path_to_stories = os.path.join(data_dir, dataset, "stories")
|
||||
story_filenames_list = os.listdir(path_to_stories)
|
||||
for story_filename in story_filenames_list:
|
||||
path_to_story = os.path.join(path_to_stories, story_filename)
|
||||
if not os.path.isfile(path_to_story):
|
||||
continue
|
||||
self.stories_path.append(path_to_story)
|
||||
self.documents = []
|
||||
story_filenames_list = os.listdir(path)
|
||||
for story_filename in story_filenames_list:
|
||||
if "summary" in story_filename:
|
||||
continue
|
||||
path_to_story = os.path.join(path, story_filename)
|
||||
if not os.path.isfile(path_to_story):
|
||||
continue
|
||||
self.documents.append(path_to_story)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.stories_path)
|
||||
""" Returns the number of documents. """
|
||||
return len(self.documents)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
story_path = self.stories_path[idx]
|
||||
with open(story_path, encoding="utf-8") as source:
|
||||
document_path = self.documents[idx]
|
||||
document_name = document_path.split("/")[-1]
|
||||
with open(document_path, encoding="utf-8") as source:
|
||||
raw_story = source.read()
|
||||
story_lines, summary_lines = process_story(raw_story)
|
||||
return story_lines, summary_lines
|
||||
return document_name, story_lines, summary_lines
|
||||
|
||||
|
||||
def process_story(raw_story):
|
||||
@@ -81,7 +86,7 @@ def process_story(raw_story):
|
||||
story_lines.append(element)
|
||||
except IndexError:
|
||||
# if "@highlight" is absent from the file we pop
|
||||
# all elements until there is None.
|
||||
# all elements until there is None, raising an exception.
|
||||
return story_lines, []
|
||||
|
||||
# gather summary lines
|
||||
@@ -104,31 +109,22 @@ def _add_missing_period(line):
|
||||
# --------------------------
|
||||
|
||||
|
||||
def fit_to_block_size(sequence, block_size, pad_token):
|
||||
def fit_to_block_size(sequence, block_size, pad_token_id):
|
||||
""" Adapt the source and target sequences' lengths to the block size.
|
||||
If the sequence is shorter than the block size we pad it with -1 ids
|
||||
which correspond to padding tokens.
|
||||
If the sequence is shorter we append padding token to the right of the sequence.
|
||||
"""
|
||||
if len(sequence) > block_size:
|
||||
return sequence[:block_size]
|
||||
else:
|
||||
sequence.extend([pad_token] * (block_size - len(sequence)))
|
||||
sequence.extend([pad_token_id] * (block_size - len(sequence)))
|
||||
return sequence
|
||||
|
||||
|
||||
def build_lm_labels(sequence, pad_token):
|
||||
""" Padding token, encoded as 0, are represented by the value -1 so they
|
||||
are not taken into account in the loss computation. """
|
||||
padded = sequence.clone()
|
||||
padded[padded == pad_token] = -1
|
||||
return padded
|
||||
|
||||
|
||||
def build_mask(sequence, pad_token):
|
||||
def build_mask(sequence, pad_token_id):
|
||||
""" Builds the mask. The attention mechanism will only attend to positions
|
||||
with value 1. """
|
||||
mask = torch.ones_like(sequence)
|
||||
idx_pad_tokens = sequence == pad_token
|
||||
idx_pad_tokens = sequence == pad_token_id
|
||||
mask[idx_pad_tokens] = 0
|
||||
return mask
|
||||
|
||||
@@ -138,18 +134,11 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
|
||||
as specified in [1] by using `[SEP] [CLS]` tokens to separate
|
||||
sentences.
|
||||
"""
|
||||
story_lines_token_ids = [
|
||||
tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
|
||||
for line in story_lines
|
||||
]
|
||||
summary_lines_token_ids = [
|
||||
tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
|
||||
for line in summary_lines
|
||||
]
|
||||
|
||||
story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
|
||||
story_token_ids = [
|
||||
token for sentence in story_lines_token_ids for token in sentence
|
||||
]
|
||||
summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
|
||||
summary_token_ids = [
|
||||
token for sentence in summary_lines_token_ids for token in sentence
|
||||
]
|
||||
@@ -174,7 +163,7 @@ def compute_token_type_ids(batch, separator_token_id):
|
||||
"""
|
||||
batch_embeddings = []
|
||||
for sequence in batch:
|
||||
sentence_num = 0
|
||||
sentence_num = -1
|
||||
embeddings = []
|
||||
for s in sequence:
|
||||
if s == separator_token_id:
|
||||
@@ -21,7 +21,6 @@ from utils_summarization import (
|
||||
compute_token_type_ids,
|
||||
fit_to_block_size,
|
||||
build_mask,
|
||||
build_lm_labels,
|
||||
process_story,
|
||||
)
|
||||
|
||||
@@ -88,20 +87,6 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
||||
expected_summary_lines = ["It was the best of times."]
|
||||
self.assertEqual(expected_summary_lines, summary_lines)
|
||||
|
||||
def test_build_lm_labels_no_padding(self):
|
||||
sequence = torch.tensor([1, 2, 3, 4])
|
||||
expected = sequence
|
||||
np.testing.assert_array_equal(
|
||||
build_lm_labels(sequence, 0).numpy(), expected.numpy()
|
||||
)
|
||||
|
||||
def test_build_lm_labels(self):
|
||||
sequence = torch.tensor([1, 2, 3, 4, 0, 0, 0])
|
||||
expected = torch.tensor([1, 2, 3, 4, -1, -1, -1])
|
||||
np.testing.assert_array_equal(
|
||||
build_lm_labels(sequence, 0).numpy(), expected.numpy()
|
||||
)
|
||||
|
||||
def test_build_mask_no_padding(self):
|
||||
sequence = torch.tensor([1, 2, 3, 4])
|
||||
expected = torch.tensor([1, 1, 1, 1])
|
||||
@@ -125,7 +110,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
||||
[[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
|
||||
)
|
||||
expected = torch.tensor(
|
||||
[[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]]
|
||||
[[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]
|
||||
)
|
||||
|
||||
result = compute_token_type_ids(batch, separator)
|
||||
@@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase):
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
testargs = ["run_squad.py",
|
||||
"--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
|
||||
"--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
|
||||
"--data_dir=./examples/tests_samples/SQUAD",
|
||||
"--model_name=bert-base-uncased",
|
||||
"--output_dir=./examples/tests_samples/temp_dir",
|
||||
"--max_steps=10",
|
||||
|
||||
140
examples/tests_samples/SQUAD/train-v2.0.json
Normal file
140
examples/tests_samples/SQUAD/train-v2.0.json
Normal file
@@ -0,0 +1,140 @@
|
||||
{
|
||||
"version": "v2.0",
|
||||
"data": [{
|
||||
"title": "Normans",
|
||||
"paragraphs": [{
|
||||
"qas": [{
|
||||
"question": "In what country is Normandy located?",
|
||||
"id": "56ddde6b9a695914005b9628",
|
||||
"answers": [{
|
||||
"text": "France",
|
||||
"answer_start": 159
|
||||
}],
|
||||
"is_impossible": false
|
||||
}, {
|
||||
"question": "When were the Normans in Normandy?",
|
||||
"id": "56ddde6b9a695914005b9629",
|
||||
"answers": [{
|
||||
"text": "10th and 11th centuries",
|
||||
"answer_start": 94
|
||||
}],
|
||||
"is_impossible": false
|
||||
}, {
|
||||
"question": "From which countries did the Norse originate?",
|
||||
"id": "56ddde6b9a695914005b962a",
|
||||
"answers": [{
|
||||
"text": "Denmark, Iceland and Norway",
|
||||
"answer_start": 256
|
||||
}],
|
||||
"is_impossible": false
|
||||
}, {
|
||||
"plausible_answers": [{
|
||||
"text": "Rollo",
|
||||
"answer_start": 308
|
||||
}],
|
||||
"question": "Who did King Charles III swear fealty to?",
|
||||
"id": "5ad39d53604f3c001a3fe8d3",
|
||||
"answers": [],
|
||||
"is_impossible": true
|
||||
}, {
|
||||
"plausible_answers": [{
|
||||
"text": "10th century",
|
||||
"answer_start": 671
|
||||
}],
|
||||
"question": "When did the Frankish identity emerge?",
|
||||
"id": "5ad39d53604f3c001a3fe8d4",
|
||||
"answers": [],
|
||||
"is_impossible": true
|
||||
}],
|
||||
"context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
|
||||
}, {
|
||||
"qas": [{
|
||||
"question": "Who was the duke in the battle of Hastings?",
|
||||
"id": "56dddf4066d3e219004dad5f",
|
||||
"answers": [{
|
||||
"text": "William the Conqueror",
|
||||
"answer_start": 1022
|
||||
}],
|
||||
"is_impossible": false
|
||||
}, {
|
||||
"plausible_answers": [{
|
||||
"text": "Antioch",
|
||||
"answer_start": 1295
|
||||
}],
|
||||
"question": "What principality did William the conquerer found?",
|
||||
"id": "5ad3a266604f3c001a3fea2b",
|
||||
"answers": [],
|
||||
"is_impossible": true
|
||||
}],
|
||||
"context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
|
||||
}]
|
||||
}, {
|
||||
"title": "Computational_complexity_theory",
|
||||
"paragraphs": [{
|
||||
"qas": [{
|
||||
"question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
|
||||
"id": "56e16182e3433e1400422e28",
|
||||
"answers": [{
|
||||
"text": "Computational complexity theory",
|
||||
"answer_start": 0
|
||||
}],
|
||||
"is_impossible": false
|
||||
}, {
|
||||
"plausible_answers": [{
|
||||
"text": "algorithm",
|
||||
"answer_start": 472
|
||||
}],
|
||||
"question": "What is a manual application of mathematical steps?",
|
||||
"id": "5ad5316b5b96ef001a10ab76",
|
||||
"answers": [],
|
||||
"is_impossible": true
|
||||
}],
|
||||
"context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
|
||||
}, {
|
||||
"qas": [{
|
||||
"question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
|
||||
"id": "56e16839cd28a01900c67887",
|
||||
"answers": [{
|
||||
"text": "if its solution requires significant resources",
|
||||
"answer_start": 46
|
||||
}],
|
||||
"is_impossible": false
|
||||
}, {
|
||||
"question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
|
||||
"id": "56e16839cd28a01900c67888",
|
||||
"answers": [{
|
||||
"text": "mathematical models of computation",
|
||||
"answer_start": 176
|
||||
}],
|
||||
"is_impossible": false
|
||||
}, {
|
||||
"question": "What are two basic primary resources used to guage complexity?",
|
||||
"id": "56e16839cd28a01900c67889",
|
||||
"answers": [{
|
||||
"text": "time and storage",
|
||||
"answer_start": 305
|
||||
}],
|
||||
"is_impossible": false
|
||||
}, {
|
||||
"plausible_answers": [{
|
||||
"text": "the number of gates in a circuit",
|
||||
"answer_start": 436
|
||||
}],
|
||||
"question": "What unit is measured to determine circuit simplicity?",
|
||||
"id": "5ad532575b96ef001a10ab7f",
|
||||
"answers": [],
|
||||
"is_impossible": true
|
||||
}, {
|
||||
"plausible_answers": [{
|
||||
"text": "the number of processors",
|
||||
"answer_start": 502
|
||||
}],
|
||||
"question": "What number is used in perpendicular computing?",
|
||||
"id": "5ad532575b96ef001a10ab80",
|
||||
"answers": [],
|
||||
"is_impossible": true
|
||||
}],
|
||||
"context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
|
||||
}]
|
||||
}]
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,330 +0,0 @@
|
||||
""" Official evaluation script for SQuAD version 2.0.
|
||||
Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
|
||||
|
||||
In addition to basic functionality, we also compute additional statistics and
|
||||
plot precision-recall curves if an additional na_prob.json file is provided.
|
||||
This file is expected to map question IDs to the model's predicted probability
|
||||
that a question is unanswerable.
|
||||
"""
|
||||
import argparse
|
||||
import collections
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
class EVAL_OPTS():
    """Plain container for the evaluation options.

    The attribute names match the ones `main` reads from its `OPTS`
    argument, so an instance can be passed to `main` in place of the
    namespace returned by `parse_args`.
    """

    def __init__(self, data_file, pred_file, out_file="",
                 na_prob_file="na_prob.json", na_prob_thresh=1.0,
                 out_image_dir=None, verbose=False):
        # Input files.
        self.data_file, self.pred_file = data_file, pred_file
        # Where to write the metrics; a falsy value means "print to stdout".
        self.out_file = out_file
        # No-answer probability file and the decision threshold applied to it.
        self.na_prob_file, self.na_prob_thresh = na_prob_file, na_prob_thresh
        # Directory for plots; None disables plotting.
        self.out_image_dir = out_image_dir
        self.verbose = verbose
|
||||
|
||||
OPTS = None
|
||||
|
||||
def parse_args():
    """Parse the command line of the evaluation script.

    Prints the help text and exits with status 1 when the script is invoked
    without any argument; otherwise returns the argparse namespace.
    """
    cli = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')

    # Positional inputs.
    cli.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
    cli.add_argument('pred_file', metavar='pred.json', help='Model predictions.')

    # Optional outputs and no-answer handling.
    cli.add_argument('--out-file', '-o', metavar='eval.json',
                     help='Write accuracy metrics to file (default is stdout).')
    cli.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
                     help='Model estimates of probability of no answer.')
    cli.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
                     help='Predict "" if no-answer probability exceeds this (default = 1.0).')
    cli.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
                     help='Save precision-recall curves to directory.')
    cli.add_argument('--verbose', '-v', action='store_true')

    no_arguments_given = len(sys.argv) == 1
    if no_arguments_given:
        cli.print_help()
        sys.exit(1)
    return cli.parse_args()
|
||||
|
||||
def make_qid_to_has_ans(dataset):
    """Map every question id in `dataset` to True when it has gold answers.

    `dataset` is iterated as articles -> 'paragraphs' -> 'qas'; a question
    counts as answerable when its 'answers' entry is non-empty.
    """
    return {
        qa['id']: bool(qa['answers'])
        for article in dataset
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    }
|
||||
|
||||
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # The order matters: punctuation is stripped before articles are removed,
    # so a token such as "a." is also treated as the article "a".
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punct = ''.join(ch for ch in lowered if ch not in punctuation)

    article_re = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    without_articles = re.sub(article_re, ' ', without_punct)

    # Collapse any run of whitespace into a single space and trim the ends.
    return ' '.join(without_articles.split())
|
||||
|
||||
def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer `s`.

    A falsy `s` (empty string, None) yields an empty token list.
    """
    if s:
        return normalize_answer(s).split()
    return []
|
||||
|
||||
def compute_exact(a_gold, a_pred):
    """Return 1 when gold and predicted answers match after normalization, else 0."""
    normalized_gold = normalize_answer(a_gold)
    normalized_pred = normalize_answer(a_pred)
    return int(normalized_gold == normalized_pred)
|
||||
|
||||
def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Returns an int (0 or 1) when either side has no tokens or when there is
    no overlap, otherwise the float harmonic mean of precision and recall.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    if not gold_toks or not pred_toks:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)

    # Multiset intersection: each shared token counts as often as it
    # appears on both sides.
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
|
||||
|
||||
def get_raw_scores(dataset, preds):
    """Score every predicted answer against its gold answers.

    Returns two dicts keyed by question id: exact-match scores and F1
    scores, each taken as the maximum over the question's gold answers.
    Questions without an entry in `preds` are reported on stdout and left
    out of both dicts.
    """
    exact_by_qid, f1_by_qid = {}, {}
    all_questions = (
        qa
        for article in dataset
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    )
    for qa in all_questions:
        qid = qa['id']
        # Keep only gold answers that are non-empty after normalization.
        # For unanswerable questions, only correct answer is empty string
        gold_answers = [
            answer['text'] for answer in qa['answers']
            if normalize_answer(answer['text'])
        ] or ['']
        if qid not in preds:
            print('Missing prediction for %s' % qid)
            continue
        a_pred = preds[qid]
        # Take max over all gold answers
        exact_by_qid[qid] = max(compute_exact(gold, a_pred) for gold in gold_answers)
        f1_by_qid[qid] = max(compute_f1(gold, a_pred) for gold in gold_answers)
    return exact_by_qid, f1_by_qid
|
||||
|
||||
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    """Re-score questions whose no-answer probability exceeds the threshold.

    When `na_probs[qid] > na_prob_thresh` the prediction is treated as
    "no answer": the score becomes 1.0 if the question really has no
    answer and 0.0 otherwise. All other scores are passed through.
    """
    return {
        qid: (float(not qid_to_has_ans[qid])
              if na_probs[qid] > na_prob_thresh
              else raw_score)
        for qid, raw_score in scores.items()
    }
|
||||
|
||||
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Aggregate per-question scores into percentages.

    With a non-empty `qid_list` only those questions are averaged;
    otherwise every entry of `exact_scores` / `f1_scores` is used.
    Returns an OrderedDict with keys 'exact', 'f1' and 'total'.
    """
    if qid_list:
        total = len(qid_list)
        exact_sum = sum(exact_scores[qid] for qid in qid_list)
        f1_sum = sum(f1_scores[qid] for qid in qid_list)
    else:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())

    return collections.OrderedDict([
        ('exact', 100.0 * exact_sum / total),
        ('f1', 100.0 * f1_sum / total),
        ('total', total),
    ])
|
||||
|
||||
def merge_eval(main_eval, new_eval, prefix):
    """Copy every entry of `new_eval` into `main_eval` under '<prefix>_<key>'."""
    for key, value in new_eval.items():
        main_eval['%s_%s' % (prefix, key)] = value
|
||||
|
||||
def plot_pr_curve(precisions, recalls, out_image, title):
    """Draw a step-style precision-recall curve and save it to `out_image`.

    Args:
        precisions: precision values, one per curve point.
        recalls: recall values, aligned with `precisions`.
        out_image: path handed to `plt.savefig`.
        title: title shown on the figure.
    """
    # Imported locally: the module only binds a global `plt` inside its
    # `__main__` guard, so relying on that global raises NameError when the
    # module is imported and used as a library.
    import matplotlib.pyplot as plt

    plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
    plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.title(title)
    plt.savefig(out_image)
    # Clear the current figure so the next plot starts from a clean state.
    plt.clf()
|
||||
|
||||
def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
                               out_image=None, title=None):
    """Compute average precision over questions ranked by no-answer probability.

    Questions are visited in increasing `na_probs` order. A curve point is
    emitted wherever a threshold could be placed, i.e. after the last
    question or where the next question has a different probability.
    When `out_image` is truthy the curve is also plotted to that path.

    Returns a dict {'ap': <average precision as a percentage>}.
    """
    ranked_qids = sorted(na_probs, key=na_probs.get)
    last_rank = len(ranked_qids) - 1

    true_pos = 0.0
    precisions, recalls = [1.0], [0.0]
    avg_prec = 0.0

    for rank, qid in enumerate(ranked_qids):
        if qid_to_has_ans[qid]:
            true_pos += scores[qid]
        precision = true_pos / float(rank + 1)
        recall = true_pos / float(num_true_pos)

        # i.e., if we can put a threshold after this point
        threshold_fits_here = (
            rank == last_rank
            or na_probs[qid] != na_probs[ranked_qids[rank + 1]]
        )
        if threshold_fits_here:
            avg_prec += precision * (recall - recalls[-1])
            precisions.append(precision)
            recalls.append(recall)

    if out_image:
        plot_pr_curve(precisions, recalls, out_image, title)
    return {'ap': 100.0 * avg_prec}
|
||||
|
||||
def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
                                  qid_to_has_ans, out_image_dir):
    """Add exact-match, F1 and oracle average-precision figures to `main_eval`.

    Creates `out_image_dir` when it is set and missing, writes one
    precision-recall image per curve into it, and merges the resulting
    'ap' values under the 'pr_exact', 'pr_f1' and 'pr_oracle' prefixes.
    Does nothing beyond the directory creation when no question is
    answerable.
    """
    if out_image_dir and not os.path.exists(out_image_dir):
        os.makedirs(out_image_dir)

    num_true_pos = sum(1 for has_ans in qid_to_has_ans.values() if has_ans)
    if num_true_pos == 0:
        return

    # The oracle scores every answerable question as 1.0 and the rest as 0.0.
    oracle_scores = {qid: float(has_ans) for qid, has_ans in qid_to_has_ans.items()}

    curves = [
        ('pr_exact', exact_raw, 'pr_exact.png',
         'Precision-Recall curve for Exact Match score'),
        ('pr_f1', f1_raw, 'pr_f1.png',
         'Precision-Recall curve for F1 score'),
        ('pr_oracle', oracle_scores, 'pr_oracle.png',
         'Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)'),
    ]

    # Evaluate all curves first, then merge, so `main_eval` is only touched
    # once every curve has been computed.
    results = [
        (prefix, make_precision_recall_eval(
            curve_scores, na_probs, num_true_pos, qid_to_has_ans,
            out_image=os.path.join(out_image_dir, image_name),
            title=title))
        for prefix, curve_scores, image_name, title in curves
    ]
    for prefix, pr_eval in results:
        merge_eval(main_eval, pr_eval, prefix)
|
||||
|
||||
def histogram_na_prob(na_probs, qid_list, image_dir, name):
    """Save a histogram of the no-answer probabilities of `qid_list`.

    Args:
        na_probs: mapping from question id to no-answer probability.
        qid_list: question ids to include; nothing is drawn when empty.
        image_dir: directory receiving 'na_prob_hist_<name>.png'.
        name: label used in the plot title and in the file name.
    """
    if not qid_list:
        return

    # Imported locally: the module only binds a global `plt` inside its
    # `__main__` guard, so relying on that global raises NameError when the
    # module is imported and used as a library.
    import matplotlib.pyplot as plt

    x = [na_probs[k] for k in qid_list]
    # Equal weights summing to 1 turn bar heights into dataset proportions.
    weights = np.ones_like(x) / float(len(x))
    plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
    plt.xlabel('Model probability of no-answer')
    plt.ylabel('Proportion of dataset')
    plt.title('Histogram of no-answer probability: %s' % name)
    plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
    # Clear the current figure so the next plot starts from a clean state.
    plt.clf()
|
||||
|
||||
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    """Search the no-answer threshold that maximises the total score.

    Starts from the score obtained by answering "no answer" everywhere
    (one point per unanswerable question) and walks the questions in
    increasing no-answer probability, switching each one to its actual
    prediction.

    Returns (best score as a percentage of len(scores), best threshold).
    """
    running = sum(1 for qid in qid_to_has_ans if not qid_to_has_ans[qid])
    best_score, best_thresh = running, 0.0

    for qid in sorted(na_probs, key=na_probs.get):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            # Answerable: using the prediction earns its raw score.
            delta = scores[qid]
        elif preds[qid]:
            # Unanswerable but a non-empty answer was predicted: lose the point.
            delta = -1
        else:
            delta = 0
        running += delta
        if running > best_score:
            best_score, best_thresh = running, na_probs[qid]

    return 100.0 * best_score / len(scores), best_thresh
|
||||
|
||||
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
    """Like `find_best_thresh`, plus the mean raw score of answerable questions.

    Starts from the score obtained by answering "no answer" everywhere
    (one point per unanswerable question) and walks the questions in
    increasing no-answer probability, switching each one to its actual
    prediction and remembering the best running total.

    Returns:
        A 3-tuple:
        - best score as a percentage of len(scores),
        - the no-answer probability at which that score was reached,
        - the sum of raw scores of answerable questions divided by the
          number of answerable questions listed in `na_probs` (not scaled
          to a percentage); 0.0 when there is no such question.
    """
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for qid in qid_list:
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        elif preds[qid]:
            # Unanswerable but a non-empty answer was predicted.
            diff = -1
        else:
            diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]

    has_ans_score, has_ans_cnt = 0, 0
    for qid in qid_list:
        if not qid_to_has_ans[qid]:
            continue
        # Counted even when the question has no score entry.
        has_ans_cnt += 1
        if qid not in scores:
            continue
        has_ans_score += scores[qid]

    # Guard against datasets without any answerable question, which used to
    # raise ZeroDivisionError here.
    has_ans_avg = 1.0 * has_ans_score / has_ans_cnt if has_ans_cnt else 0.0

    return 100.0 * best_score / len(scores), best_thresh, has_ans_avg
|
||||
|
||||
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Store the best exact-match and F1 scores and their thresholds in `main_eval`.

    Writes the keys 'best_exact', 'best_exact_thresh', 'best_f1' and
    'best_f1_thresh'.
    """
    # Run both searches before writing anything into `main_eval`.
    exact_result = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    f1_result = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)

    main_eval['best_exact'], main_eval['best_exact_thresh'] = exact_result
    main_eval['best_f1'], main_eval['best_f1_thresh'] = f1_result
|
||||
|
||||
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Store best scores, thresholds and answerable-only scores in `main_eval`.

    Writes 'best_exact', 'best_exact_thresh', 'best_f1', 'best_f1_thresh',
    'has_ans_exact' and 'has_ans_f1', using `find_best_thresh_v2` for both
    the exact-match and the F1 scores.
    """
    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
        preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
        preds, f1_raw, na_probs, qid_to_has_ans)

    results = (
        ('best_exact', best_exact),
        ('best_exact_thresh', exact_thresh),
        ('best_f1', best_f1),
        ('best_f1_thresh', f1_thresh),
        ('has_ans_exact', has_ans_exact),
        ('has_ans_f1', has_ans_f1),
    )
    for key, value in results:
        main_eval[key] = value
|
||||
|
||||
def main(OPTS):
    """Run the full evaluation described by `OPTS` and return the metrics.

    Reads the dataset and prediction files (and the no-answer probability
    file when `OPTS.na_prob_file` is set), computes overall, HasAns and
    NoAns metrics, optionally searches the best thresholds and draws the
    plots, then writes the result to `OPTS.out_file` or prints it as JSON.
    """
    with open(OPTS.data_file) as data_fp:
        dataset = json.load(data_fp)['data']
    with open(OPTS.pred_file) as pred_fp:
        preds = json.load(pred_fp)

    if not OPTS.na_prob_file:
        # Without a probability file every prediction is kept as-is.
        na_probs = {qid: 0.0 for qid in preds}
    else:
        with open(OPTS.na_prob_file) as na_prob_fp:
            na_probs = json.load(na_prob_fp)

    qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
    has_ans_qids, no_ans_qids = [], []
    for qid, has_ans in qid_to_has_ans.items():
        (has_ans_qids if has_ans else no_ans_qids).append(qid)

    exact_raw, f1_raw = get_raw_scores(dataset, preds)
    exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                          OPTS.na_prob_thresh)
    f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                       OPTS.na_prob_thresh)

    out_eval = make_eval_dict(exact_thresh, f1_thresh)
    # Per-subset metrics, only for subsets that actually contain questions.
    for prefix, subset_qids in (('HasAns', has_ans_qids), ('NoAns', no_ans_qids)):
        if subset_qids:
            subset_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=subset_qids)
            merge_eval(out_eval, subset_eval, prefix)

    if OPTS.na_prob_file:
        find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
        if OPTS.out_image_dir:
            run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
                                          qid_to_has_ans, OPTS.out_image_dir)
            histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
            histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')

    if OPTS.out_file:
        with open(OPTS.out_file, 'w') as out_fp:
            json.dump(out_eval, out_fp)
    else:
        print(json.dumps(out_eval, indent=2))
    return out_eval
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: parse the command line, then evaluate.
    OPTS = parse_args()
    if OPTS.out_image_dir:
        # Plots were requested: select the 'Agg' backend before pyplot is
        # imported, and bind the module-level `plt` name.
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    main(OPTS)
|
||||
13
setup.py
13
setup.py
@@ -36,9 +36,15 @@ To create the package for pypi.
|
||||
from io import open
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
|
||||
# Optional dependency groups, handed to setup() as `extras_require`.
extras = {
    'serving': ['uvicorn', 'fastapi']
}
# 'all' has to be a flat list of requirement strings. Collecting the group
# lists themselves would produce a nested list ([['uvicorn', 'fastapi']]).
extras['all'] = [package for group in extras.values() for package in group]
|
||||
|
||||
setup(
|
||||
name="transformers",
|
||||
version="2.2.1",
|
||||
version="2.2.2",
|
||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
||||
author_email="thomas@huggingface.co",
|
||||
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
||||
@@ -61,8 +67,11 @@ setup(
|
||||
"transformers=transformers.__main__:main",
|
||||
]
|
||||
},
|
||||
extras_require=extras,
|
||||
scripts=[
|
||||
'transformers-cli'
|
||||
],
|
||||
# python_requires='>=3.5.0',
|
||||
tests_require=['pytest'],
|
||||
classifiers=[
|
||||
'Intended Audience :: Science/Research',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import XxxConfig, is_tf_available
|
||||
|
||||
@@ -33,10 +33,9 @@ if is_tf_available():
|
||||
TFXxxForTokenClassification,
|
||||
TFXxxForQuestionAnswering,
|
||||
TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
|
||||
@@ -244,7 +243,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in ['xxx-base-uncased']:
|
||||
|
||||
@@ -18,12 +18,12 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
|
||||
@@ -31,10 +31,9 @@ if is_torch_available():
|
||||
XxxForQuestionAnswering, XxxForSequenceClassification,
|
||||
XxxForTokenClassification, XxxForMultipleChoice)
|
||||
from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
|
||||
@require_torch
|
||||
class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
|
||||
@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = XxxModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||
@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = XxxForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = XxxForQuestionAnswering(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||
@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = XxxForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||
result = {
|
||||
@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = XxxForTokenClassification(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||
result = {
|
||||
@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
23
transformers-cli
Normal file
23
transformers-cli
Normal file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env python
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from transformers.commands.user import UserCommands
|
||||
|
||||
|
||||
if __name__ == '__main__':
    parser = ArgumentParser(description='Transformers CLI tool', usage='transformers-cli <command> [<args>]')
    commands_parser = parser.add_subparsers(help='transformers-cli command helpers')

    # Register commands
    UserCommands.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()

    # No sub-command selected, so there is no handler to run: show the help
    # and exit with an error status.
    if not hasattr(args, 'func'):
        parser.print_help()
        # The bare `exit()` helper is injected by the `site` module for
        # interactive use and is not guaranteed to exist (e.g. `python -S`),
        # so raise SystemExit directly.
        raise SystemExit(1)

    # Run
    service = args.func(args)
    service.run()
|
||||
@@ -1,4 +1,4 @@
|
||||
__version__ = "2.2.1"
|
||||
__version__ = "2.2.2"
|
||||
|
||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||
# default Python logging output behavior when present.
|
||||
@@ -26,7 +26,9 @@ from .data import (is_sklearn_available,
|
||||
InputExample, InputFeatures, DataProcessor,
|
||||
glue_output_modes, glue_convert_examples_to_features,
|
||||
glue_processors, glue_tasks_num_labels,
|
||||
xnli_output_modes, xnli_processors, xnli_tasks_num_labels)
|
||||
xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
|
||||
squad_convert_examples_to_features, SquadFeatures,
|
||||
SquadExample, SquadV1Processor, SquadV2Processor)
|
||||
|
||||
if is_sklearn_available():
|
||||
from .data import glue_compute_metrics, xnli_compute_metrics
|
||||
@@ -35,6 +37,7 @@ if is_sklearn_available():
|
||||
from .tokenization_utils import (PreTrainedTokenizer)
|
||||
from .tokenization_auto import AutoTokenizer
|
||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||
from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
||||
from .tokenization_gpt2 import GPT2Tokenizer
|
||||
@@ -86,9 +89,10 @@ if is_torch_available():
|
||||
CTRLLMHeadModel,
|
||||
CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
||||
XLNetForSequenceClassification, XLNetForMultipleChoice,
|
||||
XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
|
||||
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
XLNetForSequenceClassification, XLNetForTokenClassification,
|
||||
XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
|
||||
XLNetForQuestionAnswering, load_tf_weights_in_xlnet,
|
||||
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
||||
XLMWithLMHeadModel, XLMForSequenceClassification,
|
||||
XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
|
||||
@@ -97,7 +101,7 @@ if is_torch_available():
|
||||
RobertaForSequenceClassification, RobertaForMultipleChoice,
|
||||
RobertaForTokenClassification,
|
||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
||||
from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
|
||||
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
||||
DistilBertForTokenClassification,
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
@@ -107,7 +111,7 @@ if is_torch_available():
|
||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
||||
|
||||
from .modeling_albert import (AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
|
||||
from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
|
||||
AlbertForQuestionAnswering,
|
||||
load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
|
||||
@@ -144,6 +148,7 @@ if is_tf_available():
|
||||
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
||||
TFXLNetModel, TFXLNetLMHeadModel,
|
||||
TFXLNetForSequenceClassification,
|
||||
TFXLNetForTokenClassification,
|
||||
TFXLNetForQuestionAnsweringSimple,
|
||||
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
|
||||
@@ -162,6 +167,7 @@ if is_tf_available():
|
||||
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
|
||||
TFDistilBertModel, TFDistilBertForMaskedLM,
|
||||
TFDistilBertForSequenceClassification,
|
||||
TFDistilBertForTokenClassification,
|
||||
TFDistilBertForQuestionAnswering,
|
||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
|
||||
@@ -172,6 +178,8 @@ if is_tf_available():
|
||||
from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
|
||||
TFAlbertForSequenceClassification,
|
||||
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
# Optimization
|
||||
from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
|
||||
|
||||
# TF 2.0 <=> PyTorch conversion utilities
|
||||
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
|
||||
|
||||
12
transformers/commands/__init__.py
Normal file
12
transformers/commands/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from argparse import ArgumentParser
|
||||
|
||||
class BaseTransformersCLICommand(ABC):
    """Abstract interface shared by the ``transformers`` command-line commands.

    Concrete commands must provide both a static ``register_subcommand`` hook
    and a ``run`` method.
    """

    @staticmethod
    @abstractmethod
    def register_subcommand(parser: ArgumentParser):
        """Register this command on ``parser``; must be overridden."""
        raise NotImplementedError()

    @abstractmethod
    def run(self):
        """Execute the command; must be overridden."""
        raise NotImplementedError()
|
||||
194
transformers/commands/user.py
Normal file
194
transformers/commands/user.py
Normal file
@@ -0,0 +1,194 @@
|
||||
from argparse import ArgumentParser
|
||||
from getpass import getpass
|
||||
import os
|
||||
|
||||
from transformers.commands import BaseTransformersCLICommand
|
||||
from transformers.hf_api import HfApi, HfFolder, HTTPError
|
||||
|
||||
|
||||
class UserCommands(BaseTransformersCLICommand):
    """Registers the account-related sub-commands: login, whoami, logout, ls, upload."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Add one sub-parser per user command to ``parser``.

        Each sub-parser gets a ``func`` default that builds the matching
        command object from the parsed arguments. The command classes are
        looked up lazily, when ``func`` is eventually called.
        """
        # Argument-less commands.
        parser.add_parser('login').set_defaults(func=lambda args: LoginCommand(args))
        parser.add_parser('whoami').set_defaults(func=lambda args: WhoamiCommand(args))
        parser.add_parser('logout').set_defaults(func=lambda args: LogoutCommand(args))
        parser.add_parser('ls').set_defaults(func=lambda args: ListObjsCommand(args))

        # upload
        upload = parser.add_parser('upload')
        upload.add_argument(
            'path',
            type=str,
            help='Local path of the folder or individual file to upload.',
        )
        upload.add_argument(
            '--filename',
            type=str,
            default=None,
            help='Optional: override individual object filename on S3.',
        )
        upload.set_defaults(func=lambda args: UploadCommand(args))
|
||||
|
||||
|
||||
|
||||
class ANSI:
    """
    Tiny helper around ANSI escape codes (en.wikipedia.org/wiki/ANSI_escape_code).
    """
    _bold = u"\u001b[1m"
    _reset = u"\u001b[0m"

    @classmethod
    def bold(cls, s):
        """Return ``s`` (formatted as text) wrapped in the bold and reset escape codes."""
        return "".join((cls._bold, format(s), cls._reset))
|
||||
|
||||
|
||||
class BaseUserCommand:
    """Common base for the user commands: keeps the parsed args and an ``HfApi`` client."""

    def __init__(self, args):
        """Store ``args`` and create the ``HfApi`` instance used by ``run``."""
        self._api = HfApi()
        self.args = args
|
||||
|
||||
|
||||
class LoginCommand(BaseUserCommand):
    """Interactive ``login`` command: asks for credentials and stores the returned token."""

    def run(self):
        """Prompt for username/password, log in through the API and save the token.

        Raises:
            SystemExit: with status 1 when the API call fails with ``HTTPError``.
        """
        print("""
_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|

        """)
        username = input("Username: ")
        password = getpass()
        try:
            token = self._api.login(username, password)
        except HTTPError as e:
            # probably invalid credentials, display error message.
            print(e)
            # `exit` is only injected by the `site` module for interactive use;
            # raising SystemExit works in every interpreter configuration.
            raise SystemExit(1)
        HfFolder.save_token(token)
        print("Login successful")
        # NOTE(review): this echoes the secret token to stdout; confirm that is intended.
        print("Your token:", token, "\n")
        print("Your token has been saved to", HfFolder.path_token)
|
||||
|
||||
|
||||
class WhoamiCommand(BaseUserCommand):
    """``whoami`` command: prints what the API reports for the locally stored token."""

    def run(self):
        """Print the result of ``HfApi.whoami`` for the saved token.

        Raises:
            SystemExit: with status 1 when no token is stored or when the API
                call fails with ``HTTPError`` (after printing the error), so
                that failures are visible to calling scripts, as they already
                are for the ``login``, ``ls`` and ``upload`` commands.
        """
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            # Raise SystemExit directly: the `exit` helper only exists when the
            # `site` module has been loaded.
            raise SystemExit(1)
        try:
            user = self._api.whoami(token)
        except HTTPError as e:
            print(e)
            raise SystemExit(1)
        print(user)
|
||||
|
||||
|
||||
class LogoutCommand(BaseUserCommand):
    """``logout`` command: deletes the locally stored token and logs it out on the API."""

    def run(self):
        """Delete the saved token, then call ``HfApi.logout`` with it.

        Raises:
            SystemExit: with status 0 when there is no stored token (nothing to do).
        """
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            # Same exit status as the former `exit()` call, without relying on
            # the `site`-provided helper.
            raise SystemExit(0)
        # The local token is removed first, before the remote logout call.
        HfFolder.delete_token()
        self._api.logout(token)
        print("Successfully logged out.")
|
||||
|
||||
|
||||
class ListObjsCommand(BaseUserCommand):
    """``ls`` command: prints a table of the objects returned by ``HfApi.list_objs``."""

    def tabulate(self, rows, headers):
        # type: (List[List[Union[str, int]]], List[str]) -> str
        """
        Render ``rows`` under ``headers`` as a plain-text table.

        Every column is as wide as the longest ``str()`` of its header and
        cells; a row of dashes separates the header from the data rows.

        Inspired by:
        stackoverflow.com/a/8356620/593036
        stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
        """
        col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
        # One "{:<width>} " placeholder per column.
        row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
        lines = [
            row_format.format(*headers),
            row_format.format(*["-" * w for w in col_widths]),
        ]
        lines.extend(row_format.format(*row) for row in rows)
        return "\n".join(lines)

    def run(self):
        """Fetch the user's objects and print them as a table.

        Raises:
            SystemExit: status 1 when not logged in or when the API call fails
                with ``HTTPError``; status 0 when there is nothing to list.
        """
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            # Raise SystemExit directly instead of calling the `site`-injected
            # `exit` helper, which is not guaranteed to exist.
            raise SystemExit(1)
        try:
            objs = self._api.list_objs(token)
        except HTTPError as e:
            print(e)
            raise SystemExit(1)
        if not objs:
            print("No shared file yet")
            raise SystemExit(0)
        rows = [
            [obj.filename, obj.LastModified, obj.ETag, obj.Size]
            for obj in objs
        ]
        print(
            self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])
        )
|
||||
|
||||
|
||||
class UploadCommand(BaseUserCommand):
    """``upload`` command: uploads a single file or every file of a folder."""

    def walk_dir(self, rel_path, base_dir=None):
        """
        Recursively list all files in a folder.

        Args:
            rel_path: folder to scan, interpreted relative to ``base_dir``.
            base_dir: directory ``rel_path`` is relative to; defaults to the
                current working directory (the previous, implicit behaviour).

        Returns:
            A list of ``(filepath, filename)`` tuples where ``filepath`` is
            ``base_dir`` joined with ``filename`` and ``filename`` is the
            file's path starting at ``rel_path``. Files of a directory are
            listed before the content of its sub-directories.
        """
        if base_dir is None:
            base_dir = os.getcwd()
        entries = list(os.scandir(os.path.join(base_dir, rel_path)))
        files = [
            (
                os.path.join(base_dir, rel_path, f.name),  # filepath
                os.path.join(rel_path, f.name)  # filename
            )
            for f in entries if f.is_file()
        ]
        for f in entries:
            if f.is_dir():
                files += self.walk_dir(os.path.join(rel_path, f.name), base_dir=base_dir)
        return files

    def run(self):
        """Ask for confirmation, then upload the selected file(s).

        Raises:
            SystemExit: status 1 when not logged in; status 0 when the user
                declines the confirmation prompt.
            ValueError: when ``--filename`` is combined with a folder, or when
                the path is neither a file nor a directory.
        """
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            # Raise SystemExit directly instead of calling the `site`-injected
            # `exit` helper, which is not guaranteed to exist.
            raise SystemExit(1)
        local_path = os.path.abspath(self.args.path)
        if os.path.isdir(local_path):
            if self.args.filename is not None:
                raise ValueError("Cannot specify a filename override when uploading a folder.")
            # Walk the folder from its real parent directory. Resolving only
            # its basename against the current working directory breaks for
            # any folder that is not a direct child of the CWD.
            parent_dir, dir_name = os.path.split(local_path)
            files = self.walk_dir(dir_name, base_dir=parent_dir)
        elif os.path.isfile(local_path):
            filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
            files = [(local_path, filename)]
        else:
            raise ValueError("Not a valid file or directory: {}".format(local_path))

        for filepath, filename in files:
            print(
                "About to upload file {} to S3 under filename {}".format(
                    ANSI.bold(filepath), ANSI.bold(filename)
                )
            )

        choice = input("Proceed? [Y/n] ").lower()
        if choice not in ("", "y", "yes"):
            print("Abort")
            raise SystemExit(0)
        print(
            ANSI.bold("Uploading... This might take a while if files are large")
        )
        for filepath, filename in files:
            access_url = self._api.presign_and_upload(
                token=token, filename=filename, filepath=filepath
            )
            print("Your file now lives at:")
            print(access_url)
|
||||
@@ -83,6 +83,7 @@ class AutoConfig(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||
|
||||
|
||||
@@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
|
||||
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
|
||||
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
|
||||
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
|
||||
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
|
||||
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
|
||||
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json"
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ import logging
|
||||
import os
|
||||
from io import open
|
||||
|
||||
from .file_utils import cached_path, CONFIG_NAME
|
||||
from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -79,6 +79,7 @@ class PretrainedConfig(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||
|
||||
@@ -131,8 +132,10 @@ class PretrainedConfig(object):
|
||||
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
|
||||
elif os.path.isdir(pretrained_model_name_or_path):
|
||||
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
|
||||
else:
|
||||
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
|
||||
config_file = pretrained_model_name_or_path
|
||||
else:
|
||||
config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
|
||||
@@ -187,7 +190,7 @@ class PretrainedConfig(object):
|
||||
|
||||
@classmethod
|
||||
def from_json_file(cls, json_file):
|
||||
"""Constructs a `BertConfig` from a json file of parameters."""
|
||||
"""Constructs a `Config` from a json file of parameters."""
|
||||
with open(json_file, "r", encoding='utf-8') as reader:
|
||||
text = reader.read()
|
||||
return cls.from_dict(json.loads(text))
|
||||
|
||||
@@ -119,10 +119,11 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
|
||||
tf_inputs = tf.constant(inputs_list)
|
||||
tfo = tf_model(tf_inputs, training=False) # build the network
|
||||
|
||||
pt_model = pt_model_class.from_pretrained(None,
|
||||
state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
|
||||
pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
|
||||
config=config,
|
||||
state_dict=torch.load(pytorch_checkpoint_path,
|
||||
map_location='cpu'))
|
||||
state_dict=state_dict)
|
||||
|
||||
pt_inputs = torch.tensor(inputs_list)
|
||||
with torch.no_grad():
|
||||
pto = pt_model(pt_inputs)
|
||||
@@ -139,7 +140,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
|
||||
|
||||
|
||||
def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
|
||||
compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
|
||||
compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
|
||||
assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
|
||||
|
||||
if args_model_type is None:
|
||||
@@ -187,13 +188,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
|
||||
|
||||
if os.path.isfile(model_shortcut_name):
|
||||
model_shortcut_name = 'converted_model'
|
||||
|
||||
convert_pt_checkpoint_to_tf(model_type=model_type,
|
||||
pytorch_checkpoint_path=model_file,
|
||||
config_file=config_file,
|
||||
tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
|
||||
compare_with_pt_model=compare_with_pt_model)
|
||||
os.remove(config_file)
|
||||
os.remove(model_file)
|
||||
if remove_cached_files:
|
||||
os.remove(config_file)
|
||||
os.remove(model_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -226,6 +229,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--use_cached_models",
|
||||
action='store_true',
|
||||
help = "Use cached models if possible instead of updating to latest checkpoint versions.")
|
||||
parser.add_argument("--remove_cached_files",
|
||||
action='store_true',
|
||||
help = "Remove pytorch models after conversion (save memory when converting in batches).")
|
||||
parser.add_argument("--only_convert_finetuned_models",
|
||||
action='store_true',
|
||||
help = "Only convert finetuned models.")
|
||||
@@ -245,4 +251,5 @@ if __name__ == "__main__":
|
||||
config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
|
||||
compare_with_pt_model=args.compare_with_pt_model,
|
||||
use_cached_models=args.use_cached_models,
|
||||
remove_cached_files=args.remove_cached_files,
|
||||
only_convert_finetuned_models=args.only_convert_finetuned_models)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from .processors import InputExample, InputFeatures, DataProcessor
|
||||
from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
|
||||
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||
from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
|
||||
from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||
|
||||
from .metrics import is_sklearn_available
|
||||
|
||||
763
transformers/data/metrics/squad_metrics.py
Normal file
763
transformers/data/metrics/squad_metrics.py
Normal file
@@ -0,0 +1,763 @@
|
||||
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
|
||||
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
|
||||
|
||||
In addition to basic functionality, we also compute additional statistics and
|
||||
plot precision-recall curves if an additional na_prob.json file is provided.
|
||||
This file is expected to map question ID's to the model's predicted probability
|
||||
that a question is unanswerable.
|
||||
"""
|
||||
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import collections
|
||||
from io import open
|
||||
from tqdm import tqdm
|
||||
import string
|
||||
import re
|
||||
|
||||
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    Steps, in order: lowercase, drop every ``string.punctuation`` character,
    replace the standalone words ``a``/``an``/``the`` with a space, then
    collapse all whitespace runs to single spaces.
    """
    lowered = s.lower()
    punctuation = set(string.punctuation)
    without_punct = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct, flags=re.UNICODE)
    return ' '.join(without_articles.split())
|
||||
|
||||
|
||||
def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer (``[]`` for a falsy ``s``)."""
    return normalize_answer(s).split() if s else []
|
||||
|
||||
|
||||
def compute_exact(a_gold, a_pred):
    """Return 1 when both answers are equal after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0
|
||||
|
||||
|
||||
def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Tokens come from ``get_tokens``; overlap is counted as a multiset
    intersection.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    if not gold_toks or not pred_toks:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)

    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
|
||||
|
||||
|
||||
def get_raw_scores(examples, preds):
    """
    Computes the exact and f1 scores from the examples and the model predictions.

    For each example the best score over all of its gold answers is kept.
    Examples without a prediction are reported on stdout and skipped.
    Returns ``(exact_scores, f1_scores)``, two dicts keyed by ``qas_id``.
    """
    exact_scores, f1_scores = {}, {}

    for example in examples:
        qas_id = example.qas_id
        # Gold answers that are empty after normalization are ignored; for
        # unanswerable questions, only correct answer is empty string.
        gold_answers = [
            answer['text'] for answer in example.answers if normalize_answer(answer['text'])
        ] or ['']

        if qas_id not in preds:
            print('Missing prediction for %s' % qas_id)
            continue

        prediction = preds[qas_id]
        exact_scores[qas_id] = max(compute_exact(gold, prediction) for gold in gold_answers)
        f1_scores[qas_id] = max(compute_f1(gold, prediction) for gold in gold_answers)

    return exact_scores, f1_scores
|
||||
|
||||
|
||||
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    """Return a copy of ``scores`` adjusted for the no-answer threshold.

    When a question's no-answer probability is strictly above
    ``na_prob_thresh`` its score becomes 1.0 if the question has no answer
    and 0.0 otherwise; other scores are kept unchanged.
    """
    return {
        qid: float(not qid_to_has_ans[qid]) if na_probs[qid] > na_prob_thresh else score
        for qid, score in scores.items()
    }
|
||||
|
||||
|
||||
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Average exact/f1 scores as percentages.

    With a non-empty ``qid_list`` only those ids are averaged; otherwise all
    entries are, and ``total`` is ``len(exact_scores)``. Returns an
    ``OrderedDict`` with the keys ``exact``, ``f1`` and ``total``.
    """
    if qid_list:
        total = len(qid_list)
        exact_sum = sum(exact_scores[k] for k in qid_list)
        f1_sum = sum(f1_scores[k] for k in qid_list)
    else:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())

    return collections.OrderedDict([
        ('exact', 100.0 * exact_sum / total),
        ('f1', 100.0 * f1_sum / total),
        ('total', total),
    ])
|
||||
|
||||
|
||||
def merge_eval(main_eval, new_eval, prefix):
    """Copy every entry of ``new_eval`` into ``main_eval`` under the key ``<prefix>_<key>``."""
    for key, value in new_eval.items():
        main_eval['%s_%s' % (prefix, key)] = value
|
||||
|
||||
|
||||
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
    """Sweep the no-answer threshold and report the best achievable score.

    The sweep starts from the score obtained when every question is
    predicted as unanswerable (one point per question without an answer),
    then visits the question ids by increasing no-answer probability.
    Each visited question adds its own score when it has an answer, and
    subtracts one point when it has none but a non-empty prediction exists.

    Args:
        preds: mapping qid -> predicted text.
        scores: mapping qid -> raw score; ids missing from it are skipped.
        na_probs: mapping qid -> no-answer probability.
        qid_to_has_ans: mapping qid -> whether the question has an answer.

    Returns:
        ``(best_score_percent, best_thresh, has_ans_avg)`` where
        ``has_ans_avg`` is the mean raw score over the answerable questions
        of ``na_probs``, or ``0.0`` when there is none (this case used to
        raise ``ZeroDivisionError``).

    Raises:
        ZeroDivisionError: if ``scores`` is empty.
    """
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for qid in qid_list:
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        elif preds[qid]:
            diff = -1
        else:
            diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]

    has_ans_score, has_ans_cnt = 0, 0
    for qid in qid_list:
        if not qid_to_has_ans[qid]:
            continue
        # Counted even when the question has no score entry.
        has_ans_cnt += 1

        if qid not in scores:
            continue
        has_ans_score += scores[qid]

    # Guard against data sets that contain no answerable question at all.
    has_ans_avg = 1.0 * has_ans_score / has_ans_cnt if has_ans_cnt else 0.0
    return 100.0 * best_score / len(scores), best_thresh, has_ans_avg
|
||||
|
||||
|
||||
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Run ``find_best_thresh_v2`` for the exact and f1 scores and store the results in ``main_eval``."""
    exact_best, exact_cutoff, exact_has_ans = find_best_thresh_v2(
        preds, exact_raw, na_probs, qid_to_has_ans)
    f1_best, f1_cutoff, f1_has_ans = find_best_thresh_v2(
        preds, f1_raw, na_probs, qid_to_has_ans)

    results = (
        ('best_exact', exact_best),
        ('best_exact_thresh', exact_cutoff),
        ('best_f1', f1_best),
        ('best_f1_thresh', f1_cutoff),
        ('has_ans_exact', exact_has_ans),
        ('has_ans_f1', f1_has_ans),
    )
    for key, value in results:
        main_eval[key] = value
|
||||
|
||||
|
||||
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    """Sweep the no-answer threshold; return ``(best_score_percent, best_thresh)``.

    The running score starts at the number of questions without an answer,
    then question ids are visited by increasing no-answer probability:
    an answerable question adds its score, an unanswerable one with a
    non-empty prediction costs one point.
    """
    running = sum(1 for has_ans in qid_to_has_ans.values() if not has_ans)
    best_score, best_thresh = running, 0.0

    for qid in sorted(na_probs, key=na_probs.__getitem__):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            running += scores[qid]
        elif preds[qid]:
            running += -1
        else:
            running += 0
        if running > best_score:
            best_score, best_thresh = running, na_probs[qid]

    return 100.0 * best_score / len(scores), best_thresh
|
||||
|
||||
|
||||
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Run ``find_best_thresh`` for the exact and f1 scores and store the results in ``main_eval``."""
    exact_best, exact_cutoff = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    f1_best, f1_cutoff = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)

    results = (
        ('best_exact', exact_best),
        ('best_exact_thresh', exact_cutoff),
        ('best_f1', f1_best),
        ('best_f1_thresh', f1_cutoff),
    )
    for key, value in results:
        main_eval[key] = value
|
||||
|
||||
|
||||
def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
    """Compute the SQuAD evaluation dictionary for ``preds``.

    The result holds the overall exact/f1/total entries, the same entries
    prefixed with ``HasAns_`` / ``NoAns_`` for the answerable / unanswerable
    subsets when they are non-empty, and the best-threshold entries when
    no-answer probabilities are available. When ``no_answer_probs`` is
    ``None`` a probability of 0.0 is used for every prediction.
    """
    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}

    # Split the question ids by whether the example carries any answer.
    has_answer_qids, no_answer_qids = [], []
    for qas_id, has_answer in qas_id_to_has_answer.items():
        (has_answer_qids if has_answer else no_answer_qids).append(qas_id)

    if no_answer_probs is None:
        no_answer_probs = {k: 0.0 for k in preds}

    exact, f1 = get_raw_scores(examples, preds)

    exact_threshold = apply_no_ans_threshold(
        exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
    f1_threshold = apply_no_ans_threshold(
        f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)

    evaluation = make_eval_dict(exact_threshold, f1_threshold)

    for prefix, qids in (('HasAns', has_answer_qids), ('NoAns', no_answer_qids)):
        if qids:
            subset_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=qids)
            merge_eval(evaluation, subset_eval, prefix)

    if no_answer_probs:
        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)

    return evaluation
|
||||
|
||||
|
||||
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text.

    ``orig_text`` may contain characters the prediction should not keep,
    while ``pred_text`` has already been normalized by the tokenizer. For
    example, with ``pred_text = steve smith`` and
    ``orig_text = Steve Smith's`` the wanted result is ``Steve Smith``.

    The function locates ``pred_text`` inside the re-tokenized ``orig_text``
    and maps its first and last characters back to ``orig_text`` through a
    character alignment that ignores spaces. Whenever this heuristic cannot
    be applied, ``orig_text`` is returned unchanged.
    """

    def _strip_spaces(text):
        # Returns the text without " " characters, plus a mapping from each
        # index in that stripped text to the index in `text`.
        kept = [(i, c) for (i, c) in enumerate(text) if c != " "]
        ns_to_s_map = collections.OrderedDict(
            (ns_index, s_index) for ns_index, (s_index, _) in enumerate(kept)
        )
        return ("".join(c for _, c in kept), ns_to_s_map)

    # Tokenize `orig_text` and look for the prediction in the result.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    # If both texts have the same length once spaces are removed, their
    # characters are assumed to be aligned one-to-one; otherwise give up.
    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # Invert the tokenized-text mapping: index with spaces -> index without.
    tok_s_to_ns_map = {s_index: ns_index for (ns_index, s_index) in tok_ns_to_s_map.items()}

    def _to_orig_position(tok_position):
        # tokenized index -> space-free index -> index in `orig_text`
        # (None when either lookup fails).
        ns_position = tok_s_to_ns_map.get(tok_position)
        if ns_position is None:
            return None
        return orig_ns_to_s_map.get(ns_position)

    orig_start_position = _to_orig_position(start_position)
    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = _to_orig_position(end_position)
    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    return orig_text[orig_start_position:(orig_end_position + 1)]
|
||||
|
||||
|
||||
def _get_best_indexes(logits, n_best_size):
|
||||
"""Get the n-best logits from a list."""
|
||||
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
|
||||
|
||||
best_indexes = []
|
||||
for i in range(len(index_and_score)):
|
||||
if i >= n_best_size:
|
||||
break
|
||||
best_indexes.append(index_and_score[i][0])
|
||||
return best_indexes
|
||||
|
||||
|
||||
def _compute_softmax(scores):
|
||||
"""Compute softmax probability over raw logits."""
|
||||
if not scores:
|
||||
return []
|
||||
|
||||
max_score = None
|
||||
for score in scores:
|
||||
if max_score is None or score > max_score:
|
||||
max_score = score
|
||||
|
||||
exp_scores = []
|
||||
total_sum = 0.0
|
||||
for score in scores:
|
||||
x = math.exp(score - max_score)
|
||||
exp_scores.append(x)
|
||||
total_sum += x
|
||||
|
||||
probs = []
|
||||
for score in exp_scores:
|
||||
probs.append(score / total_sum)
|
||||
return probs
|
||||
|
||||
|
||||
def compute_predictions_logits(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    do_lower_case,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    verbose_logging,
    version_2_with_negative,
    null_score_diff_threshold
):
    """Write final predictions to the json file and log-odds of null if needed.

    Args:
        all_examples: iterable of examples; each must expose `qas_id` and
            `doc_tokens`. Examples are matched to features by their position
            in this iterable.
        all_features: iterable of features; each must expose `example_index`,
            `unique_id`, `tokens`, `token_to_orig_map` and
            `token_is_max_context`.
        all_results: iterable of results; each must expose `unique_id`,
            `start_logits` and `end_logits`.
        n_best_size: number of start/end candidates considered per feature and
            maximum number of entries kept in the n-best list.
        max_answer_length: spans longer than this many tokens are discarded.
        do_lower_case: forwarded to `get_final_text`.
        output_prediction_file: path receiving the `qas_id -> text` json.
        output_nbest_file: path receiving the `qas_id -> n-best list` json.
        output_null_log_odds_file: path receiving the `qas_id -> score diff`
            json; only written when `version_2_with_negative` is true.
        verbose_logging: forwarded to `get_final_text`.
        version_2_with_negative: when true, a "no answer" candidate is scored
            from the logits at position 0 and compared against the best
            non-empty answer.
        null_score_diff_threshold: the empty answer is predicted when
            (null score - best non-null score) exceeds this value.

    Returns:
        The ordered mapping `qas_id -> predicted text` that was written to
        `output_prediction_file`.

    Raises:
        KeyError: if a feature's `unique_id` has no entry in `all_results`.
    """
    # Lazy %-args: the message is only formatted when the record is emitted.
    logger.info("Writing predictions to: %s", output_prediction_file)
    logger.info("Writing nbest to: %s", output_nbest_file)

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {result.unique_id: result for result in all_results}

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

    # Built once here instead of once per example: creating a namedtuple class
    # is loop-invariant work.
    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if version_2_with_negative:
                feature_null_score = result.start_logits[0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))
        if version_2_with_negative:
            # The null candidate is encoded as the span (0, 0).
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        # Texts already emitted for this example; only membership matters.
        seen_predictions = set()
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
                if final_text in seen_predictions:
                    continue
            else:
                final_text = ""

            seen_predictions.add(final_text)
            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))

        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="",
                        start_logit=null_start_logit,
                        end_logit=null_end_logit))

            # In very rare edge cases we could only have single null prediction.
            # So we just create a nonce prediction in this case to avoid failure.
            if len(nbest) == 1:
                nbest.insert(0,
                             _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            # First entry with a non-empty text, i.e. the best-scoring one.
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            # NOTE(review): if every n-best text were empty, best_non_null_entry
            # would still be None here and the attribute access would fail.
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions
|
||||
|
||||
|
||||
def compute_predictions_log_probs(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    start_n_top,
    end_n_top,
    version_2_with_negative,
    tokenizer,
    verbose_logging
):
    """ XLNet write prediction logic (more complex than Bert's).
        Write final predictions to the json file and log-odds of null if needed.

        Candidate spans are read from each result's top-k tables
        (`start_top_index` / `end_top_index`, scored by `start_logits` /
        `end_logits`), filtered, ranked by summed log-probability and
        de-duplicated by final text. The best entry is always predicted; the
        minimum `cls_logits` over an example's features is stored as its null
        score.

        Requires utils_squad_evaluate.py
    """
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"],
    )
    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction",
        ["text", "start_log_prob", "end_log_prob"],
    )

    logger.info("Writing predictions to: %s", output_prediction_file)

    # Group the features by the example they were cut from.
    features_by_example = collections.defaultdict(list)
    for feat in all_features:
        features_by_example[feat.example_index].append(feat)

    results_by_id = {res.unique_id: res for res in all_results}

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for example_index, example in enumerate(all_examples):
        features = features_by_example[example_index]

        candidates = []
        # Smallest null score seen over this example's features.
        score_null = 1000000  # large and positive

        for feature_index, feature in enumerate(features):
            result = results_by_id[feature.unique_id]
            score_null = min(score_null, result.cls_logits)

            for i in range(start_n_top):
                for j in range(end_n_top):
                    start_log_prob = result.start_logits[i]
                    start_index = result.start_top_index[i]

                    # The end tables are flattened: one row of end_n_top
                    # entries per start candidate.
                    flat_index = i * end_n_top + j
                    end_log_prob = result.end_logits[flat_index]
                    end_index = result.end_top_index[flat_index]

                    # Discard spans that cannot be valid answers.
                    upper_bound = feature.paragraph_len - 1
                    if start_index >= upper_bound or end_index >= upper_bound:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    if end_index - start_index + 1 > max_answer_length:
                        continue

                    candidates.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_log_prob=start_log_prob,
                            end_log_prob=end_log_prob,
                        )
                    )

        candidates.sort(key=lambda cand: cand.start_log_prob + cand.end_log_prob, reverse=True)

        emitted_texts = set()
        nbest = []
        for pred in candidates:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]

            # Bert-style un-tokenization: rebuild the text from the tokens and
            # align it with the original document tokens.
            span_end = pred.end_index + 1
            tok_tokens = feature.tokens[pred.start_index:span_end]
            first_orig = feature.token_to_orig_map[pred.start_index]
            last_orig = feature.token_to_orig_map[pred.end_index]
            orig_tokens = example.doc_tokens[first_orig:(last_orig + 1)]

            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
            # Collapse all whitespace runs to single spaces.
            tok_text = " ".join(tok_text.strip().split())
            orig_text = " ".join(orig_tokens)

            # Tokenizers expose the lower-casing flag under one of two names.
            if hasattr(tokenizer, "do_lower_case"):
                do_lower_case = tokenizer.do_lower_case
            else:
                do_lower_case = tokenizer.do_lowercase_and_remove_accent

            final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
            if final_text in emitted_texts:
                continue
            emitted_texts.add(final_text)

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_log_prob=pred.start_log_prob,
                    end_log_prob=pred.end_log_prob,
                )
            )

        # With no valid candidate at all, fall back to a nonce empty prediction
        # so the code below always has something to report.
        if not nbest:
            nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))

        total_scores = [entry.start_log_prob + entry.end_log_prob for entry in nbest]
        # The list is ranked, so the first entry is the best one.
        best_non_null_entry = nbest[0] if nbest else None

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for entry, probability in zip(nbest, probs):
            record = collections.OrderedDict()
            record["text"] = entry.text
            record["probability"] = probability
            record["start_log_prob"] = entry.start_log_prob
            record["end_log_prob"] = entry.end_log_prob
            nbest_json.append(record)

        assert len(nbest_json) >= 1
        assert best_non_null_entry is not None

        scores_diff_json[example.qas_id] = score_null
        # note(zhiliny): always predict best_non_null_entry
        # and the evaluation script will search for the best threshold
        all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions
|
||||
@@ -1,3 +1,4 @@
|
||||
from .utils import InputExample, InputFeatures, DataProcessor
|
||||
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
|
||||
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||
@@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
|
||||
if is_tf_available() and is_tf_dataset:
|
||||
def gen():
|
||||
for ex in features:
|
||||
yield ({'input_ids': ex.input_ids,
|
||||
yield ({'input_ids': ex.input_ids,
|
||||
'attention_mask': ex.attention_mask,
|
||||
'token_type_ids': ex.token_type_ids},
|
||||
ex.label)
|
||||
|
||||
653
transformers/data/processors/squad.py
Normal file
653
transformers/data/processors/squad.py
Normal file
@@ -0,0 +1,653 @@
|
||||
from tqdm import tqdm
|
||||
import collections
|
||||
import logging
|
||||
import os
|
||||
import json
|
||||
import numpy as np
|
||||
|
||||
from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||
from .utils import DataProcessor, InputExample, InputFeatures
|
||||
from ...file_utils import is_tf_available, is_torch_available
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
|
||||
"""Returns tokenized answer spans that better match the annotated answer."""
|
||||
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
|
||||
|
||||
for new_start in range(input_start, input_end + 1):
|
||||
for new_end in range(input_end, new_start - 1, -1):
|
||||
text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
|
||||
if text_span == tok_answer_text:
|
||||
return (new_start, new_end)
|
||||
|
||||
return (input_start, input_end)
|
||||
|
||||
|
||||
def _check_is_max_context(doc_spans, cur_span_index, position):
|
||||
"""Check if this is the 'max context' doc span for the token."""
|
||||
best_score = None
|
||||
best_span_index = None
|
||||
for (span_index, doc_span) in enumerate(doc_spans):
|
||||
end = doc_span.start + doc_span.length - 1
|
||||
if position < doc_span.start:
|
||||
continue
|
||||
if position > end:
|
||||
continue
|
||||
num_left_context = position - doc_span.start
|
||||
num_right_context = end - position
|
||||
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
|
||||
if best_score is None or score > best_score:
|
||||
best_score = score
|
||||
best_span_index = span_index
|
||||
|
||||
return cur_span_index == best_span_index
|
||||
|
||||
|
||||
def _new_check_is_max_context(doc_spans, cur_span_index, position):
|
||||
"""Check if this is the 'max context' doc span for the token."""
|
||||
# if len(doc_spans) == 1:
|
||||
# return True
|
||||
best_score = None
|
||||
best_span_index = None
|
||||
for (span_index, doc_span) in enumerate(doc_spans):
|
||||
end = doc_span["start"] + doc_span["length"] - 1
|
||||
if position < doc_span["start"]:
|
||||
continue
|
||||
if position > end:
|
||||
continue
|
||||
num_left_context = position - doc_span["start"]
|
||||
num_right_context = end - position
|
||||
score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
|
||||
if best_score is None or score > best_score:
|
||||
best_score = score
|
||||
best_span_index = span_index
|
||||
|
||||
return cur_span_index == best_span_index
|
||||
|
||||
|
||||
def _is_whitespace(c):
|
||||
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def squad_convert_examples_to_features(
    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False
):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': returns a torch.data.TensorDataset,
            if 'tf': returns a tf.data.Dataset

    Returns:
        list of :class:`~transformers.data.processors.squad.SquadFeatures` by default;
        a ``(features, TensorDataset)`` tuple when ``return_dataset == "pt"``;
        a ``tf.data.Dataset`` (without the feature list) when ``return_dataset == "tf"``.

    Raises:
        ImportError: if a 'pt' / 'tf' dataset is requested but the matching framework is not available.

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """

    # Every feature gets a distinct id, counted up from this base value.
    unique_id = 1000000000

    features = []
    for (example_index, example) in enumerate(tqdm(examples, desc="Converting examples to features")):
        if is_training and not example.is_impossible:
            # Get start and end position
            start_position = example.start_position
            end_position = example.end_position

            # If the answer cannot be found in the text, then skip this example.
            actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                continue

        # Sub-tokenize every document token, keeping both index mappings
        # (sub-token -> original token, original token -> first sub-token).
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                # Last sub-token of the answer's final word.
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1

            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
            )

        spans = []

        truncated_query = tokenizer.encode(
            example.question_text, add_special_tokens=False, max_length=max_query_length
        )
        # Number of special tokens the tokenizer adds around one sequence / a pair.
        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

        span_doc_tokens = all_doc_tokens
        while len(spans) * doc_stride < len(all_doc_tokens):

            # The query goes first for right-padding tokenizers, last otherwise.
            encoded_dict = tokenizer.encode_plus(
                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
                max_length=max_seq_length,
                return_overflowing_tokens=True,
                pad_to_max_length=True,
                stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
                truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
            )

            paragraph_len = min(
                len(all_doc_tokens) - len(spans) * doc_stride,
                max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
            )

            # Drop the padding before converting ids back to tokens. With right
            # padding the pads trail the content; with left padding they lead
            # it, so cutting at the first pad would leave nothing.
            input_ids = encoded_dict["input_ids"]
            if tokenizer.pad_token_id in input_ids:
                if tokenizer.padding_side == "right":
                    non_padded_ids = input_ids[: input_ids.index(tokenizer.pad_token_id)]
                else:
                    num_leading_pads = 0
                    while (
                        num_leading_pads < len(input_ids)
                        and input_ids[num_leading_pads] == tokenizer.pad_token_id
                    ):
                        num_leading_pads += 1
                    non_padded_ids = input_ids[num_leading_pads:]
            else:
                non_padded_ids = input_ids

            tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

            token_to_orig_map = {}
            for i in range(paragraph_len):
                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
                token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

            encoded_dict["paragraph_len"] = paragraph_len
            encoded_dict["tokens"] = tokens
            encoded_dict["token_to_orig_map"] = token_to_orig_map
            encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
            encoded_dict["token_is_max_context"] = {}
            encoded_dict["start"] = len(spans) * doc_stride
            encoded_dict["length"] = paragraph_len

            spans.append(encoded_dict)

            # No overflow left means the whole document has been covered.
            if "overflowing_tokens" not in encoded_dict:
                break
            span_doc_tokens = encoded_dict["overflowing_tokens"]

        # Flag, for each paragraph token of each span, whether that span is the
        # one giving the token the most context.
        for doc_span_index in range(len(spans)):
            for j in range(spans[doc_span_index]["paragraph_len"]):
                is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
                index = (
                    j
                    if tokenizer.padding_side == "left"
                    else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
                )
                spans[doc_span_index]["token_is_max_context"][index] = is_max_context

        for span in spans:
            # Identify the position of the CLS token
            cls_index = span["input_ids"].index(tokenizer.cls_token_id)

            # p_mask: mask with 1 for token that cannot be in the answer (0 for token which can be in an answer)
            # Original TF implem also keep the classification token (set to 0) (not sure why...)
            p_mask = np.array(span["token_type_ids"])

            # Limit positive values to one
            p_mask = np.minimum(p_mask, 1)

            if tokenizer.padding_side == "right":
                # Invert the mask for right-padding tokenizers.
                p_mask = 1 - p_mask

            # Separator tokens can never be part of an answer.
            p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1

            # Set the CLS index to '0'
            p_mask[cls_index] = 0

            span_is_impossible = example.is_impossible
            start_position = 0
            end_position = 0
            if is_training and not span_is_impossible:
                # For training, a document chunk that does not fully contain the
                # annotation is labelled as impossible and pointed at the CLS token.
                doc_start = span["start"]
                doc_end = span["start"] + span["length"] - 1
                out_of_span = False

                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                    out_of_span = True

                if out_of_span:
                    start_position = cls_index
                    end_position = cls_index
                    span_is_impossible = True
                else:
                    if tokenizer.padding_side == "left":
                        doc_offset = 0
                    else:
                        doc_offset = len(truncated_query) + sequence_added_tokens

                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            features.append(
                SquadFeatures(
                    span["input_ids"],
                    span["attention_mask"],
                    span["token_type_ids"],
                    cls_index,
                    p_mask.tolist(),
                    example_index=example_index,
                    unique_id=unique_id,
                    paragraph_len=span["paragraph_len"],
                    token_is_max_context=span["token_is_max_context"],
                    tokens=span["tokens"],
                    token_to_orig_map=span["token_to_orig_map"],
                    start_position=start_position,
                    end_position=end_position,
                )
            )

            unique_id += 1

    if return_dataset == "pt":
        if not is_torch_available():
            raise ImportError("Pytorch must be installed to return a pytorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if not is_training:
            # Running index over the features (one entry per feature row).
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
            )
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids,
                all_attention_masks,
                all_token_type_ids,
                all_start_positions,
                all_end_positions,
                all_cls_index,
                all_p_mask,
            )

        return features, dataset
    elif return_dataset == "tf":
        if not is_tf_available():
            raise ImportError("TensorFlow must be installed to return a TensorFlow dataset.")

        def gen():
            # Yields one (model inputs, targets) pair per feature.
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    }, {
                        "start_position": ex.start_position,
                        "end_position": ex.end_position,
                        "cls_index": ex.cls_index,
                        "p_mask": ex.p_mask,
                    }
                )

        return tf.data.Dataset.from_generator(
            gen,
            (
                {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
                {"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32},
            ),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                {
                    "start_position": tf.TensorShape([]),
                    "end_position": tf.TensorShape([]),
                    "cls_index": tf.TensorShape([]),
                    "p_mask": tf.TensorShape([None]),
                },
            ),
        )

    return features
|
||||
|
||||
|
||||
class SquadProcessor(DataProcessor):
|
||||
"""
|
||||
Processor for the SQuAD data set.
|
||||
Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
|
||||
"""
|
||||
|
||||
train_file = None
|
||||
dev_file = None
|
||||
|
||||
def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
    """Build a SquadExample from one dataset record.

    In training mode only the first answer (text and start offset) is used and
    `answers` is left empty; in evaluation mode every answer is collected into
    `answers` and the single-answer fields are None.
    """
    raw_answers = tensor_dict["answers"]
    if evaluate:
        answers = [
            {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
            for start, text in zip(raw_answers["answer_start"], raw_answers["text"])
        ]
        answer = None
        answer_start = None
    else:
        answer = raw_answers["text"][0].numpy().decode("utf-8")
        answer_start = raw_answers["answer_start"][0].numpy()
        answers = []

    return SquadExample(
        qas_id=tensor_dict["id"].numpy().decode("utf-8"),
        question_text=tensor_dict["question"].numpy().decode("utf-8"),
        context_text=tensor_dict["context"].numpy().decode("utf-8"),
        answer_text=answer,
        start_position_character=answer_start,
        title=tensor_dict["title"].numpy().decode("utf-8"),
        answers=answers,
    )
|
||||
|
||||
def get_examples_from_dataset(self, dataset, evaluate=False):
    """
    Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.

    Args:
        dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
        evaluate: boolean specifying if in evaluation mode or in training mode;
            selects the "validation" split when true and the "train" split otherwise.

    Returns:
        List of SquadExample

    Examples::

        import tensorflow_datasets as tfds
        dataset = tfds.load("squad")

        training_examples = get_examples_from_dataset(dataset, evaluate=False)
        evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
    """
    split = dataset["validation"] if evaluate else dataset["train"]
    return [self._get_example_from_tensor_dict(record, evaluate=evaluate) for record in tqdm(split)]
|
||||
|
||||
def get_train_examples(self, data_dir, filename=None):
    """
    Load the training examples from a SQuAD json file.

    Args:
        data_dir: Directory containing the data files; None is treated as the empty string,
            so the file name is used as-is.
        filename: Optional name of the training file. When None, the class attribute
            `train_file` is used (`train-v1.1.json` for SquadV1Processor and
            `train-v2.0.json` for SquadV2Processor).

    Returns:
        The examples built by `_create_examples` from the file's "data" entry.

    Raises:
        ValueError: if `train_file` is None on this processor.
    """
    if self.train_file is None:
        raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

    base_dir = "" if data_dir is None else data_dir
    file_name = filename if filename is not None else self.train_file
    path = os.path.join(base_dir, file_name)

    with open(path, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    return self._create_examples(input_data, "train")
|
||||
|
||||
def get_dev_examples(self, data_dir, filename=None):
    """
    Load the evaluation examples from a SQuAD json file.

    Args:
        data_dir: Directory containing the data files; None is treated as the empty string,
            so the file name is used as-is.
        filename: Optional name of the evaluation file. When None, the class attribute
            `dev_file` is used (`dev-v1.1.json` for SquadV1Processor and
            `dev-v2.0.json` for SquadV2Processor).

    Returns:
        The examples built by `_create_examples` from the file's "data" entry.

    Raises:
        ValueError: if `dev_file` is None on this processor.
    """
    if self.dev_file is None:
        raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

    base_dir = "" if data_dir is None else data_dir
    file_name = filename if filename is not None else self.dev_file
    path = os.path.join(base_dir, file_name)

    with open(path, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    return self._create_examples(input_data, "dev")
|
||||
|
||||
def _create_examples(self, input_data, set_type):
    """
    Turn the "data" list of a SQuAD json file into a flat list of SquadExample.

    Args:
        input_data: list of entries, each with a "title" and a list of "paragraphs"; each
            paragraph has a "context" and a list of "qas".
        set_type: "train" selects training mode (only the first answer of each question is
            kept); any other value keeps the full answer list in `answers`.

    Returns:
        List of SquadExample, one per question.
    """
    is_training = set_type == "train"
    examples = []

    for entry in tqdm(input_data):
        title = entry["title"]
        for paragraph in entry["paragraphs"]:
            context_text = paragraph["context"]
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]

                # Defaults used for unanswerable questions.
                answer_text = None
                start_position_character = None
                answers = []

                # The flag is optional in the json; a missing flag means answerable.
                is_impossible = qa["is_impossible"] if "is_impossible" in qa else False

                if is_impossible:
                    pass
                elif is_training:
                    first_answer = qa["answers"][0]
                    answer_text = first_answer["text"]
                    start_position_character = first_answer["answer_start"]
                else:
                    answers = qa["answers"]

                examples.append(
                    SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        context_text=context_text,
                        answer_text=answer_text,
                        start_position_character=start_position_character,
                        title=title,
                        is_impossible=is_impossible,
                        answers=answers,
                    )
                )

    return examples
|
||||
|
||||
|
||||
class SquadV1Processor(SquadProcessor):
    """SquadProcessor bound to the default SQuAD v1.1 file names."""

    dev_file = "dev-v1.1.json"
    train_file = "train-v1.1.json"
|
||||
|
||||
|
||||
class SquadV2Processor(SquadProcessor):
    """SquadProcessor bound to the default SQuAD v2.0 file names."""

    dev_file = "dev-v2.0.json"
    train_file = "train-v2.0.json"
|
||||
|
||||
|
||||
class SquadExample(object):
    """
    A single training/test example for the Squad dataset, as loaded from disk.

    Args:
        qas_id: The example's unique identifier
        question_text: The question string
        context_text: The context string
        answer_text: The answer string
        start_position_character: The character position of the start of the answer
        title: The title of the example
        answers: None by default (stored as a fresh empty list), this is used during evaluation.
            Holds answers as well as their start positions.
        is_impossible: False by default, set to True if the example has no possible answer.

    Attributes set in addition to the arguments:
        doc_tokens: the context split on whitespace.
        char_to_word_offset: for each character of the context, the index of the token it belongs to.
        start_position, end_position: token indices of the answer; both stay 0 when no
            start character is given or the example is impossible.
    """

    def __init__(
        self,
        qas_id,
        question_text,
        context_text,
        answer_text,
        start_position_character,
        title,
        answers=None,
        is_impossible=False,
    ):
        self.qas_id = qas_id
        self.question_text = question_text
        self.context_text = context_text
        self.answer_text = answer_text
        self.title = title
        self.is_impossible = is_impossible
        # A `[]` default would be shared by every instance created without `answers`.
        self.answers = [] if answers is None else answers

        self.start_position, self.end_position = 0, 0

        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True

        # Split on whitespace so that different tokens may be attributed to their original position.
        for c in self.context_text:
            if _is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        self.doc_tokens = doc_tokens
        self.char_to_word_offset = char_to_word_offset

        # Start and end positions only get a value when an answer start character is provided
        # and the example is answerable.
        if start_position_character is not None and not is_impossible:
            self.start_position = char_to_word_offset[start_position_character]
            # Clamp so an answer that runs past the end of the context maps to the last
            # token instead of raising IndexError.
            last_char = min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
            self.end_position = char_to_word_offset[last_char]
|
||||
|
||||
|
||||
class SquadFeatures(object):
    """
    Model-specific features for a single squad example, ready to be fed to a model.

    They can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
    using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        cls_index: the index of the CLS token.
        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
            Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer
        example_index: the index of the example
        unique_id: The unique Feature identifier
        paragraph_len: The length of the context
        token_is_max_context: List of booleans identifying which tokens have their maximum context in this
            feature object. If a token does not have its maximum context in this feature object, it means
            that another feature object has more information related to that token and should be
            prioritized over this feature for that token.
        tokens: list of tokens corresponding to the input ids
        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
        start_position: start of the answer token index
        end_position: end of the answer token index
    """

    def __init__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        cls_index,
        p_mask,
        example_index,
        unique_id,
        paragraph_len,
        token_is_max_context,
        tokens,
        token_to_orig_map,
        start_position,
        end_position,
    ):
        # Answer span.
        self.start_position, self.end_position = start_position, end_position

        # Bookkeeping that ties the feature back to its example and original text.
        self.example_index, self.unique_id = example_index, unique_id
        self.paragraph_len = paragraph_len
        self.token_is_max_context = token_is_max_context
        self.tokens, self.token_to_orig_map = tokens, token_to_orig_map

        # Model inputs.
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.cls_index, self.p_mask = cls_index, p_mask
|
||||
|
||||
|
||||
class SquadResult(object):
    """
    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.

    Args:
        unique_id: The unique identifier corresponding to that example.
        start_logits: The logits corresponding to the start of the answer
        end_logits: The logits corresponding to the end of the answer
        start_top_index: optional; when given, it is stored together with `end_top_index`
            and `cls_logits`. When None, none of those three attributes is set.
        end_top_index: optional, stored only when `start_top_index` is given.
        cls_logits: optional, stored only when `start_top_index` is given.
    """

    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
        self.start_logits = start_logits
        self.end_logits = end_logits
        self.unique_id = unique_id

        # Compare against None explicitly: a truthiness test raises on multi-element
        # arrays/tensors and silently drops falsy but valid values.
        if start_top_index is not None:
            self.start_top_index = start_top_index
            self.end_top_index = end_top_index
            self.cls_logits = cls_logits
|
||||
@@ -21,7 +21,7 @@ import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
from tqdm.auto import tqdm
|
||||
from contextlib import contextmanager
|
||||
|
||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||
@@ -73,6 +73,8 @@ TF2_WEIGHTS_NAME = 'tf_model.h5'
|
||||
TF_WEIGHTS_NAME = 'model.ckpt'
|
||||
CONFIG_NAME = "config.json"
|
||||
|
||||
S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
|
||||
|
||||
def is_torch_available():
    # Returns the module-level `_torch_available` flag, which is set outside this function.
    return _torch_available
|
||||
|
||||
@@ -103,6 +105,18 @@ else:
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
def is_remote_url(url_or_filename):
    """Return True when `url_or_filename` parses with an http, https or s3 scheme."""
    scheme = urlparse(url_or_filename).scheme
    return scheme in ('http', 'https', 's3')
|
||||
|
||||
def hf_bucket_url(identifier, postfix=None):
    """
    Join `S3_BUCKET_PREFIX`, `identifier` and, when given, `postfix` with "/" separators.
    """
    parts = [S3_BUCKET_PREFIX, identifier]
    if postfix is not None:
        parts.append(postfix)
    return "/".join(parts)
|
||||
|
||||
|
||||
def url_to_filename(url, etag=None):
|
||||
"""
|
||||
Convert `url` into a hashed filename in a repeatable way.
|
||||
@@ -171,9 +185,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
parsed = urlparse(url_or_filename)
|
||||
|
||||
if parsed.scheme in ('http', 'https', 's3'):
|
||||
if is_remote_url(url_or_filename):
|
||||
# URL, so get it from the cache (downloading if necessary)
|
||||
return get_from_cache(url_or_filename, cache_dir=cache_dir,
|
||||
force_download=force_download, proxies=proxies,
|
||||
@@ -181,7 +193,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
||||
elif os.path.exists(url_or_filename):
|
||||
# File, and it exists.
|
||||
return url_or_filename
|
||||
elif parsed.scheme == '':
|
||||
elif urlparse(url_or_filename).scheme == '':
|
||||
# File, but it doesn't exist.
|
||||
raise EnvironmentError("file {} not found".format(url_or_filename))
|
||||
else:
|
||||
@@ -245,7 +257,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0):
|
||||
return
|
||||
content_length = response.headers.get('Content-Length')
|
||||
total = resume_size + int(content_length) if content_length is not None else None
|
||||
progress = tqdm(unit="B", total=total, initial=resume_size)
|
||||
progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading")
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk: # filter out keep-alive new chunks
|
||||
progress.update(len(chunk))
|
||||
|
||||
228
transformers/hf_api.py
Normal file
228
transformers/hf_api.py
Normal file
@@ -0,0 +1,228 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
from os.path import expanduser
|
||||
|
||||
import requests
|
||||
import six
|
||||
from requests.exceptions import HTTPError
|
||||
from tqdm import tqdm
|
||||
|
||||
ENDPOINT = "https://huggingface.co"
|
||||
|
||||
class S3Obj:
    """
    Plain record describing one stored file, built from the keyword fields
    `filename`, `LastModified`, `ETag` and `Size`; any other keyword is accepted and ignored.
    """

    def __init__(
        self,
        filename,      # type: str
        LastModified,  # type: str
        ETag,          # type: str
        Size,          # type: int
        **kwargs
    ):
        self.filename, self.LastModified = filename, LastModified
        self.ETag, self.Size = ETag, Size
|
||||
|
||||
|
||||
class PresignedUrl:
    """
    Plain record holding the `write` url, the `access` url and the mime `type`
    of a presigned upload; any other keyword is accepted and ignored.
    """

    def __init__(
        self,
        write,   # type: str
        access,  # type: str
        type,    # type: str
        **kwargs
    ):
        self.type = type  # mime-type to send to S3.
        self.access = access
        self.write = write
|
||||
|
||||
|
||||
class HfApi:
    """
    Small HTTP client for the HF API.

    Every method builds its url from `self.endpoint`, calls `raise_for_status()` on the
    response (so HTTP errors surface as `requests.exceptions.HTTPError`) and, where a token
    is required, sends it as an "authorization: Bearer <token>" header.
    """

    def __init__(self, endpoint=None):
        # Fall back to the module-level ENDPOINT when no endpoint is given.
        self.endpoint = endpoint if endpoint is not None else ENDPOINT

    def login(
        self,
        username,  # type: str
        password,  # type: str
    ):
        # type: (...) -> str
        """
        Call HF API to sign in a user and get a token if credentials are valid.

        Outputs:
            token if credentials are valid

        Throws:
            requests.exceptions.HTTPError if credentials are invalid
        """
        path = "{}/api/login".format(self.endpoint)
        r = requests.post(path, json={"username": username, "password": password})
        r.raise_for_status()
        d = r.json()
        return d["token"]

    def whoami(
        self,
        token,  # type: str
    ):
        # type: (...) -> str
        """
        Call HF API to know "whoami"

        Outputs:
            the "user" field of the JSON response
        """
        path = "{}/api/whoami".format(self.endpoint)
        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()
        d = r.json()
        return d["user"]

    def logout(self, token):
        # type: (...) -> None
        """
        Call HF API to log out.
        """
        path = "{}/api/logout".format(self.endpoint)
        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()

    def presign(self, token, filename):
        # type: (...) -> PresignedUrl
        """
        Call HF API to get a presigned url to upload `filename` to S3.

        Outputs:
            PresignedUrl built from the JSON response
        """
        path = "{}/api/presign".format(self.endpoint)
        r = requests.post(
            path,
            headers={"authorization": "Bearer {}".format(token)},
            json={"filename": filename},
        )
        r.raise_for_status()
        d = r.json()
        return PresignedUrl(**d)

    def presign_and_upload(self, token, filename, filepath):
        # type: (...) -> str
        """
        Get a presigned url, then upload file to S3.

        Outputs:
            url: Read-only url for the stored file on S3.
        """
        urls = self.presign(token, filename=filename)
        # streaming upload:
        # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
        #
        # Even though we presign with the correct content-type,
        # the client still has to specify it when uploading the file.
        with open(filepath, "rb") as f:
            # Wrapping `f` hooks its read() so the upload drives a progress bar.
            pf = TqdmProgressFileReader(f)
            try:
                r = requests.put(urls.write, data=f, headers={
                    "content-type": urls.type,
                })
                r.raise_for_status()
            finally:
                # Close the progress bar even when the upload fails.
                pf.close()
        return urls.access

    def list_objs(self, token):
        # type: (...) -> List[S3Obj]
        """
        Call HF API to list all stored files for user.

        Outputs:
            one S3Obj per item of the JSON response
        """
        path = "{}/api/listObjs".format(self.endpoint)
        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()
        d = r.json()
        return [S3Obj(**x) for x in d]
|
||||
|
||||
|
||||
|
||||
class TqdmProgressFileReader:
    """
    Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
    and override `f.read()` so as to display a tqdm progress bar.

    see github.com/huggingface/transformers/pull/2078#discussion_r354739608
    for implementation details.
    """
    def __init__(
        self,
        f   # type: io.BufferedReader
    ):
        self.f = f
        self.total_size = os.fstat(f.fileno()).st_size  # type: int
        self.pbar = tqdm(total=self.total_size, leave=False)
        if six.PY3:
            # does not work unless PY3
            # no big deal as the CLI does not currently support PY2 anyways.
            self.read = f.read
            f.read = self._read

    def _read(self, n=-1):
        """
        Read through the original `read` and advance the bar by the bytes actually returned.

        The requested size `n` is not a valid progress amount: it is -1 for "read everything"
        and can exceed what is left in the file on the last chunk.
        """
        data = self.read(n)
        self.pbar.update(len(data))
        return data

    def close(self):
        """Close the progress bar (the wrapped file is left untouched)."""
        self.pbar.close()
|
||||
|
||||
|
||||
|
||||
class HfFolder:
    """Read, write and delete the token stored in the file at `path_token`."""

    path_token = expanduser("~/.huggingface/token")

    @classmethod
    def save_token(cls, token):
        """
        Save token, creating folder as needed.
        """
        import errno

        folder = os.path.dirname(cls.path_token)
        if folder:
            # Same code path on Python 2 and 3: create the folder and tolerate only
            # "already exists" (`os.errno` is not available on recent Python versions).
            try:
                os.makedirs(folder)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        with open(cls.path_token, 'w+') as f:
            f.write(token)

    @classmethod
    def get_token(cls):
        """
        Get token or None if not existent.

        Only I/O errors while opening/reading the file are turned into None.
        """
        try:
            with open(cls.path_token, 'r') as f:
                return f.read()
        except (IOError, OSError):
            # Covers FileNotFoundError on Python 3 and IOError on Python 2.
            return None

    @classmethod
    def delete_token(cls):
        """
        Delete token.
        Do not fail if token does not exist.
        """
        try:
            os.remove(cls.path_token)
        except OSError:
            # Best effort: a missing (or otherwise unremovable) file is not an error here.
            return
|
||||
@@ -28,7 +28,6 @@ from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassifica
|
||||
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
|
||||
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
|
||||
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
|
||||
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
|
||||
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
|
||||
|
||||
from .modeling_utils import PreTrainedModel, SequenceSummary
|
||||
@@ -94,6 +93,7 @@ class AutoModel(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
|
||||
@@ -232,6 +232,7 @@ class AutoModelWithLMHead(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
|
||||
@@ -361,6 +362,7 @@ class AutoModelForSequenceClassification(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
|
||||
@@ -479,6 +481,7 @@ class AutoModelForQuestionAnswering(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
|
||||
|
||||
@@ -1,271 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019 Yang Liu
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
"""
|
||||
A general wrapper around models with LM heads to generate sequences
|
||||
using beam search.
|
||||
"""
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
|
||||
class TransformerBeamSearch(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
tokenizer,
|
||||
batch_size,
|
||||
beam_size,
|
||||
min_length,
|
||||
max_length,
|
||||
alpha=0,
|
||||
block_repeating_trigram=True,
|
||||
):
|
||||
"""
|
||||
Attributes:
|
||||
mask_word_id: token id that corresponds to the mask
|
||||
"""
|
||||
super(TransformerBeamSearch, self).__init__()
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
self.start_token_id = tokenizer.start_token_id
|
||||
self.end_token_id = tokenizer.end_token_id
|
||||
self.pad_token_id = tokenizer.pad_token_id
|
||||
|
||||
self.beam_size = beam_size
|
||||
self.min_length = min_length
|
||||
self.max_length = max_length
|
||||
|
||||
self.block_repeating_trigram = block_repeating_trigram
|
||||
self.apply_length_penalty = False if alpha == 0 else True
|
||||
self.alpha = alpha
|
||||
|
||||
# State of the beam
|
||||
self.hypotheses = [[] for _ in range(batch_size)]
|
||||
self.batch_offset = torch.arange(batch_size, dtype=torch.long)
|
||||
self.beam_offset = torch.arange(
|
||||
0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long
|
||||
)
|
||||
self.growing_beam = torch.full(
|
||||
(batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
|
||||
)
|
||||
self.topk_log_probabilities = torch.tensor(
|
||||
[0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float
|
||||
).repeat(batch_size)
|
||||
self.results = {
|
||||
"prediction": [[] for _ in batch_size],
|
||||
"scores": [[] for _ in batch_size],
|
||||
}
|
||||
self._step = 0
|
||||
self.is_done = False
|
||||
|
||||
def step(self, log_probabilities):
|
||||
""" Grows the beam by one step. """
|
||||
self._step += 1
|
||||
|
||||
# The batch size changes as some beams finish so we define _B
|
||||
vocab_size = log_probabilities.size(-1)
|
||||
_B = log_probabilities.size(0) // self.beam_size
|
||||
|
||||
# Multiply each beam probability with the probability of the
|
||||
# next token (conditioned on the words in the beam).
|
||||
log_probabilities += self.topk_log_probabilities.view(-1, 1)
|
||||
|
||||
self.enforce_min_length(log_probabilities)
|
||||
if self.block_repeating_trigram:
|
||||
self.remove_repeating_trigrams(log_probabilities, _B)
|
||||
|
||||
# Find the `beam_size` (previous_beam + token) combinations with
|
||||
# the highest score
|
||||
topk_log_probabilities, topk_ids = log_probabilities.topk(
|
||||
log_probabilities.view(_B, self.beam_size * vocab_size),
|
||||
self.beam_size,
|
||||
dim=1,
|
||||
)
|
||||
|
||||
# Apply the length penalty. The +1 accounts for the [EOS] token
|
||||
# that will be added if the beam ends.
|
||||
topk_scores = topk_log_probabilities / self.length_penalty()
|
||||
|
||||
# Retrieve the corresponding respective beam and token id
|
||||
# topk_token_ids[i] will be added to topk_beam_ids[i]
|
||||
topk_beam_ids = topk_ids.div(vocab_size)
|
||||
topk_token_ids = topk_ids.fmod(vocab_size)
|
||||
|
||||
# Retrieve the row index of the surviving beams in the original
|
||||
# view of the log_probabilities tensor
|
||||
surviving_beams_rows = (topk_beam_ids + self.beam_offset[:_B].view(-1, 1)).view(
|
||||
-1
|
||||
)
|
||||
|
||||
# Append the last predictions
|
||||
self.growing_beam = torch.cat(
|
||||
[
|
||||
self.growing_beam.index_select(0, surviving_beams_rows),
|
||||
topk_token_ids.view(-1, 1),
|
||||
],
|
||||
1,
|
||||
)
|
||||
|
||||
# Check if any of the beam searches has ended during this
|
||||
# growth step. Also if top beam (most probable) has ended
|
||||
# for one element of the batch.
|
||||
is_finished = topk_token_ids.eq(self.end_token_id)
|
||||
self.enforce_max_length()
|
||||
is_top_beam_finished = is_finished[:, 0].eq(1)
|
||||
|
||||
# Save the finished searches
|
||||
if is_finished.any():
|
||||
predictions = self.growing_beam.view(
|
||||
-1, self.beam_size, self.growing_beam.size(1)
|
||||
)
|
||||
for i in range(is_finished.size(0)):
|
||||
if is_top_beam_finished[i]:
|
||||
is_finished[i].fill_(1)
|
||||
finished_hyp = is_finished[i].nonzero().view(-1)
|
||||
|
||||
# Store finished hypotheses for this batch.
|
||||
b = self.batch_offset[i]
|
||||
for j in finished_hyp:
|
||||
self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
|
||||
|
||||
# If the batch reached the end, save the best hypotheses
|
||||
# in terms of length-penalized score.
|
||||
if is_top_beam_finished[i]:
|
||||
best_hyp = sorted(
|
||||
self.hypotheses[b], key=lambda x: x[0], reverse=True
|
||||
)
|
||||
best_score, best_prediction = best_hyp[0]
|
||||
self.results["scores"][b].append(best_score)
|
||||
self.results["predictions"][b].append(best_prediction)
|
||||
|
||||
non_finished = is_top_beam_finished.eq(0).nonzero().view(-1)
|
||||
if len(non_finished) == 0:
|
||||
self.is_done = True
|
||||
|
||||
# Remove finished batches for the next step.
|
||||
topk_log_probabilities = topk_log_probabilities.index_select(
|
||||
0, non_finished
|
||||
)
|
||||
self.batch_offset = self.batch_offset.index_select(0, non_finished)
|
||||
self.growing_beam = predictions.index_select(0, non_finished).view(
|
||||
-1, self.growing_beam.size(-1)
|
||||
)
|
||||
|
||||
surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished)
|
||||
|
||||
return surviving_beams_rows
|
||||
|
||||
def forward(self, encoder_input_ids, **kwargs):
|
||||
# keyword arguments come in 3 flavors: encoder-specific (prefixed by
|
||||
# `encoder_`), decoder-specific (prefixed by `decoder_`) and those
|
||||
# that apply to the model as whole.
|
||||
# We let the specific kwargs override the common ones in case of conflict.
|
||||
kwargs_encoder = {
|
||||
argument[len("encoder_"):]: value
|
||||
for argument, value in kwargs.items()
|
||||
if argument.startswith("encoder_")
|
||||
}
|
||||
kwargs_decoder = {
|
||||
argument[len("decoder_"):]: value
|
||||
for argument, value in kwargs.items()
|
||||
if argument.startswith("decoder_")
|
||||
}
|
||||
kwargs_common = {
|
||||
argument: value
|
||||
for argument, value in kwargs.items()
|
||||
if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
|
||||
}
|
||||
kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
|
||||
kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
|
||||
|
||||
# forward pass on the encoder
|
||||
encoder_outputs = self.model.encoder.forward(encoder_input_ids, kwargs_encoder)
|
||||
kwargs_decoder["encoder_hidden_states"] = tile(
|
||||
encoder_outputs, self.beam_size, dim=0
|
||||
)
|
||||
|
||||
# grow the beam by generating sequences in an autoregressive way
|
||||
self.growing_beam = torch.full(
|
||||
(self.batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
|
||||
)
|
||||
for step in range(self.max_length):
|
||||
decoder_input = self.growing_beam[:, -1]
|
||||
outputs = self.model.decoder(decoder_input, kwargs_decoder)
|
||||
log_probabilities = torch.nn.functional.log_softmax(outputs[1])
|
||||
surviving_beams_rows = self.step(log_probabilities)
|
||||
if self.is_done:
|
||||
break
|
||||
|
||||
kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
|
||||
"encoder_hidden_states"
|
||||
].index_select(0, surviving_beams_rows)
|
||||
|
||||
return self.results
|
||||
|
||||
def remove_repeating_trigrams(self, log_probabilities, _B):
|
||||
if(self._step + 1 > 3):
|
||||
for i in range(_B * self.beam_size):
|
||||
tokens = [t for t in self.growing_beam[i]]
|
||||
trigrams = [(tokens[i-1], tokens[i], tokens[i+1]) for i in range(1, len(words) - 1)]
|
||||
last_trigram = tuple(trigrams[-1])
|
||||
if last_trigram in trigrams[:-1]:
|
||||
log_probabilities[i] = -1e20
|
||||
|
||||
def enforce_min_length(self):
    """ Forbid the end token while the sequence is shorter than ``min_length``.

    When ``self._step`` is below ``self.min_length``, the entry of
    ``self.log_probabilities`` at index ``self.end_token_id`` is set to
    ``-1e20``.
    """
    # NOTE(review): this indexes the first axis with the end-token id; if
    # log_probabilities is laid out as (beams, vocabulary) the intended index
    # may be the last axis instead - confirm against the caller.
    still_too_short = self._step < self.min_length
    if still_too_short:
        self.log_probabilities[self.end_token_id] = -1e20
|
||||
|
||||
def enforce_max_length(self):
    """ Flag every beam as finished once the maximum length is reached.

    When the upcoming step count (``self._step + 1``) equals
    ``self.max_length``, ``self.is_finished`` is filled in place with 1.
    """
    upcoming_length = self._step + 1
    if upcoming_length == self.max_length:
        self.is_finished.fill_(1)
|
||||
|
||||
def length_penalty(self):
    """ Return the length-normalisation factor for the current step.

    Computes ``((5 + length) / 6) ** alpha`` where ``length`` is
    ``self._step + 1`` and ``alpha`` is ``self.alpha``.
    """
    # NOTE(review): the shape of this formula looks like the GNMT length
    # penalty - confirm before citing it as such.
    current_length = self._step + 1
    base = (5.0 + current_length) / 6.0
    return base ** self.alpha
|
||||
|
||||
|
||||
def tile(x, count, dim=0):
    """
    Repeat each slice of `x` along dimension `dim` `count` times.

    The copies of a given slice are adjacent in the result (element-wise
    repetition rather than concatenation of whole copies of `x`).

    Example:
        >> ex = torch.tensor([[1,2],[3,4]])
        >> tile(ex, 2, 0)
        torch.Tensor([[1,2],[1,2],[3,4],[3,4]])
    """
    axis_order = list(range(x.dim()))
    needs_swap = dim != 0
    if needs_swap:
        # Bring the target dimension to the front so that the tiling below
        # only ever has to deal with dimension 0.
        axis_order[0], axis_order[dim] = axis_order[dim], axis_order[0]
        x = x.permute(axis_order).contiguous()

    tiled_shape = list(x.size())
    tiled_shape[0] *= count
    leading = x.size(0)

    # Flatten everything but the leading dimension, repeat the flattened
    # features `count` times per row, then fold the copies back into rows.
    flattened = x.view(leading, -1)
    repeated = flattened.transpose(0, 1).repeat(count, 1).transpose(0, 1)
    x = repeated.contiguous().view(*tiled_shape)

    if needs_swap:
        # Swapping two axes is its own inverse, so the same order restores
        # the original layout.
        x = x.permute(axis_order).contiguous()
    return x
|
||||
@@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
|
||||
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
|
||||
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
|
||||
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
|
||||
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
|
||||
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
|
||||
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
|
||||
}
|
||||
|
||||
|
||||
@@ -667,18 +671,20 @@ class BertModel(BertPreTrainedModel):
|
||||
# ourselves in which case we just need to make it broadcastable to all heads.
|
||||
if attention_mask.dim() == 3:
|
||||
extended_attention_mask = attention_mask[:, None, :, :]
|
||||
|
||||
# Provided a padding mask of dimensions [batch_size, seq_length]
|
||||
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
||||
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if attention_mask.dim() == 2:
|
||||
elif attention_mask.dim() == 2:
|
||||
# Provided a padding mask of dimensions [batch_size, seq_length]
|
||||
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
||||
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.is_decoder:
|
||||
batch_size, seq_length = input_shape
|
||||
seq_ids = torch.arange(seq_length, device=device)
|
||||
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
|
||||
causal_mask = causal_mask.to(torch.long) # not converting to long will cause errors with pytorch version < 1.3
|
||||
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
||||
else:
|
||||
extended_attention_mask = attention_mask[:, None, None, :]
|
||||
else:
|
||||
raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
|
||||
|
||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||
# masked positions, this operation will create a tensor which is 0.0 for
|
||||
@@ -690,14 +696,19 @@ class BertModel(BertPreTrainedModel):
|
||||
|
||||
# If a 2D or 3D attention mask is provided for the cross-attention
|
||||
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.is_decoder:
|
||||
if self.config.is_decoder and encoder_hidden_states is not None:
|
||||
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
||||
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
||||
if encoder_attention_mask is None:
|
||||
encoder_attention_mask = torch.ones(input_shape, device=device)
|
||||
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
|
||||
|
||||
if encoder_attention_mask.dim() == 3:
|
||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
|
||||
if encoder_attention_mask.dim() == 2:
|
||||
elif encoder_attention_mask.dim() == 2:
|
||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
|
||||
else:
|
||||
raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape,
|
||||
encoder_attention_mask.shape))
|
||||
|
||||
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
|
||||
@@ -1226,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
|
||||
input_ids = tokenizer.encode(input_text)
|
||||
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
|
||||
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
|
||||
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
|
||||
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
|
||||
# a nice puppet
|
||||
|
||||
|
||||
@@ -252,7 +252,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the last layer of the model.
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
@@ -438,7 +438,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
|
||||
@@ -59,12 +59,14 @@ class PreTrainedEncoderDecoder(nn.Module):
|
||||
encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
|
||||
decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
|
||||
|
||||
@@ -329,7 +329,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the last layer of the model.
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
@@ -503,7 +503,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
@@ -596,7 +596,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
**mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
|
||||
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
|
||||
@@ -50,8 +50,10 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
|
||||
|
||||
logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
|
||||
|
||||
names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
|
||||
shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
|
||||
with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
|
||||
names = json.load(names_handle)
|
||||
with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
|
||||
shapes = json.load(shapes_handle)
|
||||
offsets = np.cumsum([np.prod(shape) for shape in shapes])
|
||||
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
|
||||
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
|
||||
|
||||
@@ -81,6 +81,7 @@ class TFAutoModel(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||
|
||||
@@ -212,6 +213,7 @@ class TFAutoModelWithLMHead(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||
|
||||
@@ -338,6 +340,7 @@ class TFAutoModelForSequenceClassification(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||
|
||||
@@ -453,6 +456,7 @@ class TFAutoModelForQuestionAnswering(object):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||
|
||||
|
||||
@@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
|
||||
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
|
||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
|
||||
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
|
||||
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
|
||||
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
|
||||
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5"
|
||||
}
|
||||
|
||||
|
||||
@@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
||||
linear tensor, float32 with shape [batch_size, length, vocab_size].
|
||||
Raises:
|
||||
ValueError: if mode is not valid.
|
||||
|
||||
|
||||
Shared weights logic adapted from
|
||||
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
|
||||
"""
|
||||
@@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
||||
input_shape = shape_list(input_ids)
|
||||
else:
|
||||
input_shape = shape_list(inputs_embeds)[:-1]
|
||||
|
||||
|
||||
seq_length = input_shape[1]
|
||||
if position_ids is None:
|
||||
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
||||
@@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
||||
context_layer = tf.matmul(attention_probs, value_layer)
|
||||
|
||||
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
|
||||
context_layer = tf.reshape(context_layer,
|
||||
context_layer = tf.reshape(context_layer,
|
||||
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
|
||||
|
||||
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
|
||||
@@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
|
||||
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||
|
||||
Parameters:
|
||||
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
@@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r"""
|
||||
(a) For sequence pairs:
|
||||
|
||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||
|
||||
(b) For single sequences:
|
||||
|
||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0``
|
||||
|
||||
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
|
||||
@@ -400,7 +400,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
|
||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the last layer of the model.
|
||||
**past**:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
@@ -462,7 +462,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
|
||||
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**past**:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
|
||||
@@ -37,7 +37,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5"
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
|
||||
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
|
||||
}
|
||||
|
||||
|
||||
@@ -703,6 +704,53 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
|
||||
return outputs # logits, (hidden_states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
            Classification scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Backbone, dropout and a per-token linear classifier producing
        # `config.num_labels` scores for every position.
        self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='classifier')

    def call(self, inputs, **kwargs):
        """ Run the backbone and classify every token.

        Returns:
            tuple ``(scores, (hidden_states), (attentions))``; the optional
            entries are whatever the backbone returned after its first output.
        """
        outputs = self.distilbert(inputs, **kwargs)

        sequence_output = outputs[0]

        # Dropout is only active when the caller passes `training=True`.
        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
        logits = self.classifier(sequence_output)

        # NOTE(review): the DistilBert main layer returns no pooled output, so
        # the optional hidden states / attentions start at index 1; slicing
        # from 2 (as BERT heads do) would drop the hidden states. Confirm
        # against TFDistilBertMainLayer.call.
        outputs = (logits,) + outputs[1:]  # add hidden states and attention if they are here

        return outputs  # scores, (hidden_states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
||||
the hidden-states output to compute `span start logits` and `span end logits`). """,
|
||||
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
||||
|
||||
@@ -436,7 +436,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
|
||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the last layer of the model.
|
||||
**past**:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
@@ -476,7 +476,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
|
||||
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**past**:
|
||||
list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
@@ -535,7 +535,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
||||
**mc_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices)``
|
||||
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
||||
**past**:
|
||||
list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
|
||||
@@ -24,7 +24,8 @@ import os
|
||||
import tensorflow as tf
|
||||
|
||||
from .configuration_utils import PretrainedConfig
|
||||
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
|
||||
from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
|
||||
cached_path, hf_bucket_url, is_remote_url)
|
||||
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -51,7 +52,15 @@ class TFPreTrainedModel(tf.keras.Model):
|
||||
config_class = None
|
||||
pretrained_model_archive_map = {}
|
||||
base_model_prefix = ""
|
||||
dummy_inputs = tf.constant(DUMMY_INPUTS) # dummy inputs to build the network
|
||||
|
||||
@property
def dummy_inputs(self):
    """ Dummy inputs to build the network.

    Evaluated on every access, wrapping the module-level ``DUMMY_INPUTS``
    constant in a new constant tensor.

    Returns:
        tf.Tensor with dummy inputs
    """
    dummy_tensor = tf.constant(DUMMY_INPUTS)
    return dummy_tensor
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||
@@ -168,6 +177,7 @@ class TFPreTrainedModel(tf.keras.Model):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
|
||||
|
||||
@@ -249,10 +259,14 @@ class TFPreTrainedModel(tf.keras.Model):
|
||||
raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
|
||||
[WEIGHTS_NAME, TF2_WEIGHTS_NAME],
|
||||
pretrained_model_name_or_path))
|
||||
elif os.path.isfile(pretrained_model_name_or_path):
|
||||
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
|
||||
archive_file = pretrained_model_name_or_path
|
||||
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
|
||||
archive_file = pretrained_model_name_or_path + ".index"
|
||||
else:
|
||||
raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path))
|
||||
archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
|
||||
if from_pt:
|
||||
raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.")
|
||||
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
|
||||
@@ -938,6 +938,59 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
|
||||
return outputs # return logits, (mems), (hidden states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
            Classification scores (before SoftMax).
        **mems**: (`optional`, returned when ``config.mem_len > 0``)
            list of ``tf.Tensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
            See details in the docstring of the `mems` input above.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import XLNetTokenizer, TFXLNetForTokenClassification

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        scores = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.transformer = TFXLNetMainLayer(config, name='transformer')
        # Per-token projection onto the label space.
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='classifier')

    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)
        # First element is the sequence of hidden states fed to the classifier.
        output = transformer_outputs[0]

        logits = self.classifier(output)

        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it

        return outputs  # return logits, (mems), (hidden states), (attentions)
|
||||
|
||||
|
||||
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
||||
# the hidden-states output to compute `span start logits` and `span end logits`). """,
|
||||
# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
||||
|
||||
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
|
||||
from torch.nn import functional as F
|
||||
|
||||
from .configuration_utils import PretrainedConfig
|
||||
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
|
||||
from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
|
||||
cached_path, hf_bucket_url, is_remote_url)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -265,6 +266,7 @@ class PreTrainedModel(nn.Module):
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as the ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint to a PyTorch model with the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
- None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
|
||||
@@ -318,7 +320,8 @@ class PreTrainedModel(nn.Module):
|
||||
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
|
||||
if pretrained_model_name_or_path is not None and (
|
||||
"albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path):
|
||||
logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
|
||||
"https://github.com/google-research/google-research/issues/119 for more information.")
|
||||
|
||||
@@ -362,11 +365,16 @@ class PreTrainedModel(nn.Module):
|
||||
raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
|
||||
[WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
|
||||
pretrained_model_name_or_path))
|
||||
elif os.path.isfile(pretrained_model_name_or_path):
|
||||
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
|
||||
archive_file = pretrained_model_name_or_path
|
||||
else:
|
||||
assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
|
||||
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
|
||||
assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
|
||||
pretrained_model_name_or_path + ".index")
|
||||
archive_file = pretrained_model_name_or_path + ".index"
|
||||
else:
|
||||
archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
|
||||
if from_tf:
|
||||
raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
|
||||
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
@@ -740,7 +748,7 @@ class SequenceSummary(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(SequenceSummary, self).__init__()
|
||||
|
||||
self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
|
||||
self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
|
||||
if self.summary_type == 'attn':
|
||||
# We should use a standard multi-head attention module with absolute positional embedding for that.
|
||||
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
|
||||
|
||||
@@ -583,6 +583,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -878,7 +879,11 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
|
||||
outputs = outputs + (hidden_states,)
|
||||
if self.output_attentions:
|
||||
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
|
||||
if target_mapping is not None:
|
||||
# when target_mapping is provided, there are 2-tuple of attentions
|
||||
attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions)
|
||||
else:
|
||||
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
|
||||
outputs = outputs + (attentions,)
|
||||
|
||||
return outputs # outputs, (new_mems), (hidden_states), (attentions)
|
||||
@@ -913,6 +918,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -995,6 +1001,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -1046,6 +1053,106 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
|
||||
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
|
||||
|
||||
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
    XLNET_START_DOCSTRING,
    XLNET_INPUTS_DOCSTRING)
class XLNetForTokenClassification(XLNetPreTrainedModel):
    r"""
    Inputs:
        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
            When ``labels`` is provided, only positions where this mask equals ``1`` contribute to the loss.
        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification loss.
        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
            Classification scores (before SoftMax).
        **mems**: (`optional`, returned when ``config.mem_len > 0``)
            list of ``torch.FloatTensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
            See details in the docstring of the `mems` input above.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, scores = outputs[:2]  # the loss comes first because labels were provided

    """
    def __init__(self, config):
        super(XLNetForTokenClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLNetModel(config)
        # Per-token projection onto the label space.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.transformer(input_ids,
                                   attention_mask=attention_mask,
                                   mems=mems,
                                   perm_mask=perm_mask,
                                   target_mapping=target_mapping,
                                   token_type_ids=token_type_ids,
                                   input_mask=input_mask,
                                   head_mask=head_mask,
                                   inputs_embeds=inputs_embeds)

        sequence_output = outputs[0]

        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[1:]  # Keep mems, hidden states, attentions if there are in it
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                # Flatten batch and sequence dims, then keep positions whose mask value is 1.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
|
||||
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
|
||||
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
||||
@@ -1095,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -1180,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -1294,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
|
||||
254
transformers/optimization_tf.py
Normal file
254
transformers/optimization_tf.py
Normal file
@@ -0,0 +1,254 @@
|
||||
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Functions and classes related to optimization (weight updates)."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Applies a polynomial warmup phase in front of another learning rate schedule.

    While ``step < warmup_steps`` the rate is
    ``initial_learning_rate * (step / warmup_steps) ** power``; from then on the
    wrapped ``decay_schedule_fn`` is evaluated at ``step``.
    """

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
        super(WarmUp, self).__init__()
        self.name = name
        self.power = power
        self.warmup_steps = warmup_steps
        self.decay_schedule_fn = decay_schedule_fn
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        """Return the learning rate to use at ``step``."""
        with tf.name_scope(self.name or 'WarmUp') as scope_name:
            current_step = tf.cast(step, tf.float32)
            warmup_length = tf.cast(self.warmup_steps, tf.float32)
            # Fraction of the warmup phase already completed.
            progress = current_step / warmup_length
            warmup_rate = self.initial_learning_rate * tf.math.pow(progress, self.power)

            def _in_warmup():
                return warmup_rate

            def _after_warmup():
                return self.decay_schedule_fn(step)

            return tf.cond(current_step < warmup_length,
                           _in_warmup,
                           _after_warmup,
                           name=scope_name)

    def get_config(self):
        """Return the constructor arguments as a dict."""
        config = {}
        config['initial_learning_rate'] = self.initial_learning_rate
        config['decay_schedule_fn'] = self.decay_schedule_fn
        config['warmup_steps'] = self.warmup_steps
        config['power'] = self.power
        config['name'] = self.name
        return config
|
||||
|
||||
|
||||
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
    """Build an ``AdamWeightDecay`` optimizer driven by a decaying learning rate schedule.

    Args:
        init_lr: initial (peak) learning rate.
        num_train_steps: number of steps over which the rate decays to ``0.0``.
        num_warmup_steps: length of the warmup phase; when falsy no warmup is applied.

    Returns:
        The configured ``AdamWeightDecay`` instance.
    """
    # PolynomialDecay with its default power gives a linear decay down to 0.
    schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)

    if num_warmup_steps:
        schedule = WarmUp(initial_learning_rate=init_lr,
                          decay_schedule_fn=schedule,
                          warmup_steps=num_warmup_steps)

    return AdamWeightDecay(
        learning_rate=schedule,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['layer_norm', 'bias'])
|
||||
|
||||
|
||||
class AdamWeightDecay(tf.keras.optimizers.Adam):
    """Adam enables L2 weight decay and clip_by_global_norm on gradients.

    Just adding the square of the weights to the loss function is *not* the
    correct way of using L2 regularization/weight decay with Adam, since that will
    interact with the m and v parameters in strange ways.

    Instead we want to decay the weights in a manner that doesn't interact with
    the m/v parameters. This is equivalent to adding the square of the weights to
    the loss with plain (non-momentum) SGD.

    Args:
        weight_decay_rate: decay factor; ``0`` disables weight decay entirely.
        include_in_weight_decay: optional list of regex patterns; a variable whose
            name matches one of them is always decayed.
        exclude_from_weight_decay: optional list of regex patterns; a variable whose
            name matches one of them (and none of the include patterns) is not decayed.

    The remaining arguments are forwarded unchanged to ``tf.keras.optimizers.Adam``.
    """

    def __init__(self,
                 learning_rate=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-7,
                 amsgrad=False,
                 weight_decay_rate=0.0,
                 include_in_weight_decay=None,
                 exclude_from_weight_decay=None,
                 name='AdamWeightDecay',
                 **kwargs):
        super(AdamWeightDecay, self).__init__(
            learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {'WarmUp': WarmUp}
        return super(AdamWeightDecay, cls).from_config(
            config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Extend the parent's per-step state with the weight decay rate constant."""
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
                                                    apply_state)
        apply_state['weight_decay_rate'] = tf.constant(
            self.weight_decay_rate, name='adam_weight_decay_rate')

    def _decay_weights_op(self, var, learning_rate, apply_state):
        """Return the op that subtracts ``learning_rate * var * weight_decay_rate`` from ``var``.

        Returns ``tf.no_op()`` for variables that are not subject to weight decay.
        """
        if not self._do_use_weight_decay(var.name):
            return tf.no_op()
        # `_get_lr` accepts `apply_state=None`, so this method must not subscript
        # it blindly; fall back to the plain attribute when the prepared constant
        # is unavailable.
        decay_rate = (apply_state or {}).get('weight_decay_rate', self.weight_decay_rate)
        return var.assign_sub(
            learning_rate * var * decay_rate,
            use_locking=self._use_locking)

    def apply_gradients(self, grads_and_vars, clip_norm, name=None):
        """Clip gradients by global norm, then apply them.

        Args:
            grads_and_vars: iterable of ``(gradient, variable)`` pairs.
            clip_norm: value passed as ``clip_norm`` to ``tf.clip_by_global_norm``.
            name: optional name forwarded to the parent ``apply_gradients``.
        """
        grads, tvars = list(zip(*grads_and_vars))
        (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
        # Forward `name` instead of silently discarding it.
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state.

        Returns a ``(learning_rate, kwargs)`` pair, where ``kwargs`` is what should be
        forwarded to the parent ``_resource_apply_*`` method.
        """
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients['lr_t'], dict(apply_state=apply_state)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Decay the weights, then run the regular dense Adam update."""
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        # The decay op must run before the Adam update touches the variable.
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(
                grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        """Decay the weights, then run the regular sparse Adam update."""
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(
                grad, var, indices, **kwargs)

    def get_config(self):
        """Return the parent config extended with ``weight_decay_rate``."""
        config = super(AdamWeightDecay, self).get_config()
        config.update({
            'weight_decay_rate': self.weight_decay_rate,
        })
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        # Include patterns are checked first, so they win over exclude patterns.
        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True
|
||||
|
||||
|
||||
## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
|
||||
class GradientAccumulator(object):
    """Distribution strategies-aware gradient accumulation utility.

    Call the instance with a list of gradients to add them to the running sums,
    read the sums through :attr:`gradients`, and call :meth:`reset` to zero them.
    ``None`` entries in the gradient list are kept as ``None`` placeholders.
    """

    def __init__(self):
        """Initializes the accumulator."""
        # Accumulation variables are created lazily on the first call.
        self._gradients = []
        self._accum_steps = tf.Variable(
            initial_value=0,
            dtype=tf.int64,
            trainable=False,
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)

    @property
    def step(self):
        """Number of accumulated steps."""
        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients."""
        return [gradient.value() if gradient is not None else gradient
                for gradient in self._get_replica_gradients()]

    def __call__(self, gradients):
        """Accumulates :obj:`gradients`.

        Raises:
            ValueError: if the number of gradients differs from the first call.
        """
        if not self._gradients:
            self._gradients.extend([
                tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient
                for gradient in gradients])

        if len(gradients) != len(self._gradients):
            raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))

        for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
            if accum_gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients."""
        if self._gradients:
            self._accum_steps.assign(0)

        for gradient in self._get_replica_gradients():
            if gradient is not None:
                gradient.assign(tf.zeros_like(gradient))

    def _get_replica_gradients(self):
        """Return the accumulation variables to use in the current context."""
        if tf.distribute.has_strategy():
            # In a replica context, we want to accumulate gradients on each replica
            # without synchronization, so we directly assign the value of the
            # current replica.
            replica_context = tf.distribute.get_replica_context()

            if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
                return self._gradients

            # `None` placeholders have no per-replica values; pass them through
            # so callers' `is not None` checks keep working.
            return (
                gradient.device_map.select_for_current_replica(gradient.values, replica_context)
                if gradient is not None else None
                for gradient in self._gradients)
        else:
            return self._gradients
|
||||
@@ -1,31 +0,0 @@
|
||||
# content of conftest.py
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
    """Register the ``--runslow`` and ``--use_cuda`` boolean command-line flags."""
    flag_help = (
        ("--runslow", "run slow tests"),
        ("--use_cuda", "run tests on gpu"),
    )
    for flag, description in flag_help:
        parser.addoption(flag, action="store_true", default=False, help=description)
|
||||
|
||||
|
||||
def pytest_configure(config):
    """Declare the ``slow`` marker on the given config."""
    slow_marker = "slow: mark test as slow to run"
    config.addinivalue_line("markers", slow_marker)
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
    """Mark every ``slow`` test as skipped unless ``--runslow`` was given."""
    run_slow = config.getoption("--runslow")
    if not run_slow:
        # One shared skip marker is attached to each slow item.
        skip_marker = pytest.mark.skip(reason="need --runslow option to run")
        for test_item in items:
            if "slow" in test_item.keywords:
                test_item.add_marker(skip_marker)
|
||||
|
||||
@pytest.fixture
def use_cuda(request):
    """Fixture returning the value of the ``--use_cuda`` command-line option."""
    pytest_config = request.config
    return pytest_config.getoption("--use_cuda")
|
||||
102
transformers/tests/hf_api_test.py
Normal file
102
transformers/tests/hf_api_test.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
import six
|
||||
import time
|
||||
import unittest
|
||||
|
||||
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
|
||||
|
||||
USER = "__DUMMY_TRANSFORMERS_USER__"
|
||||
PASS = "__DUMMY_TRANSFORMERS_PASS__"
|
||||
FILE_KEY = "Test-{}.txt".format(int(time.time()))
|
||||
FILE_PATH = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
|
||||
)
|
||||
|
||||
|
||||
|
||||
class HfApiCommonTest(unittest.TestCase):
    """Base test case holding one shared ``HfApi`` client for its subclasses."""

    # Client bound to the endpoint below; subclasses reach it as `self._api` / `cls._api`.
    _api = HfApi(endpoint="https://moon-staging.huggingface.co")
|
||||
|
||||
|
||||
class HfApiLoginTest(HfApiCommonTest):
    """Login behaviour with bad and good credentials."""

    def test_login_invalid(self):
        """A wrong password must raise ``HTTPError``."""
        self.assertRaises(HTTPError, self._api.login, username=USER, password="fake")

    def test_login_valid(self):
        """Valid credentials must yield a string token."""
        issued_token = self._api.login(username=USER, password=PASS)
        self.assertIsInstance(issued_token, six.string_types)
|
||||
|
||||
|
||||
class HfApiEndpointsTest(HfApiCommonTest):
    """Tests for the API calls that need a login token."""

    @classmethod
    def setUpClass(cls):
        """
        Log in once and share the resulting token with all tests below.
        """
        cls._token = cls._api.login(username=USER, password=PASS)

    def test_whoami(self):
        """``whoami`` must return the user name that logged in."""
        self.assertEqual(self._api.whoami(token=self._token), USER)

    def test_presign(self):
        """``presign`` must return a ``PresignedUrl`` typed as plain text."""
        presigned = self._api.presign(token=self._token, filename=FILE_KEY)
        self.assertIsInstance(presigned, PresignedUrl)
        self.assertEqual(presigned.type, "text/plain")

    def test_presign_and_upload(self):
        """``presign_and_upload`` must return a string."""
        upload_args = dict(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
        uploaded_url = self._api.presign_and_upload(**upload_args)
        self.assertIsInstance(uploaded_url, six.string_types)

    def test_list_objs(self):
        """``list_objs`` must return a list whose last element, if any, is an ``S3Obj``."""
        listing = self._api.list_objs(token=self._token)
        self.assertIsInstance(listing, list)
        if listing:
            self.assertIsInstance(listing[-1], S3Obj)
|
||||
|
||||
|
||||
|
||||
class HfFolderTest(unittest.TestCase):
    """Tests for the ``HfFolder`` token helpers."""

    def test_token_workflow(self):
        """
        Exercise the full save/get/delete token cycle, including the expected
        behavior when the token no longer exists.
        """
        fresh_token = "token-{}".format(int(time.time()))
        HfFolder.save_token(fresh_token)
        self.assertEqual(HfFolder.get_token(), fresh_token)

        # Deleting twice is deliberate: the second call, made when no token is
        # left, must not fail.
        for _ in range(2):
            HfFolder.delete_token()

        self.assertEqual(HfFolder.get_token(), None)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -18,22 +18,21 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
|
||||
AlbertForSequenceClassification, AlbertForQuestionAnswering,
|
||||
)
|
||||
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
|
||||
@require_torch
|
||||
class AlbertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
|
||||
@@ -133,6 +132,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = AlbertModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||
@@ -150,6 +150,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = AlbertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
@@ -163,6 +164,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = AlbertForQuestionAnswering(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||
@@ -183,6 +185,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = AlbertForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||
result = {
|
||||
@@ -225,7 +228,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,11 +18,12 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import (AutoConfig, BertConfig,
|
||||
AutoModel, BertModel,
|
||||
@@ -33,12 +34,11 @@ if is_torch_available():
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
|
||||
@require_torch
|
||||
class AutoModelTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -53,7 +53,7 @@ class AutoModelTest(unittest.TestCase):
|
||||
for value in loading_info.values():
|
||||
self.assertEqual(len(value), 0)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_lmhead_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -66,7 +66,7 @@ class AutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertForMaskedLM)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_sequence_classification_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -79,7 +79,7 @@ class AutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertForSequenceClassification)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_question_answering_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertForQuestionAnswering)
|
||||
|
||||
def test_from_pretrained_identifier(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
|
||||
self.assertIsInstance(model, BertForMaskedLM)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -18,12 +18,12 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import (BertConfig, BertModel, BertForMaskedLM,
|
||||
@@ -31,11 +31,9 @@ if is_torch_available():
|
||||
BertForQuestionAnswering, BertForSequenceClassification,
|
||||
BertForTokenClassification, BertForMultipleChoice)
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("use_cuda")
|
||||
@require_torch
|
||||
class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
|
||||
@@ -67,7 +65,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
num_labels=3,
|
||||
num_choices=4,
|
||||
scope=None,
|
||||
device='cpu',
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
@@ -91,26 +88,25 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
self.num_labels = num_labels
|
||||
self.num_choices = num_choices
|
||||
self.scope = scope
|
||||
self.device = device
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(self.device)
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
input_mask = None
|
||||
if self.use_input_mask:
|
||||
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(self.device)
|
||||
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
||||
|
||||
token_type_ids = None
|
||||
if self.use_token_type_ids:
|
||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size).to(self.device)
|
||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
sequence_labels = None
|
||||
token_labels = None
|
||||
choice_labels = None
|
||||
if self.use_labels:
|
||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(self.device)
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(self.device)
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices).to(self.device)
|
||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
@@ -144,7 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertModel(config=config)
|
||||
model.to(input_ids.device)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||
@@ -161,6 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
|
||||
model = BertModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
|
||||
@@ -177,6 +174,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
@@ -190,6 +188,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
|
||||
model = BertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
|
||||
@@ -204,6 +203,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertForNextSentencePrediction(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
|
||||
result = {
|
||||
@@ -217,6 +217,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertForPreTraining(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
|
||||
@@ -235,6 +236,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertForQuestionAnswering(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||
@@ -254,6 +256,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = BertForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||
result = {
|
||||
@@ -268,6 +271,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = BertForTokenClassification(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||
result = {
|
||||
@@ -282,6 +286,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_choices = self.num_choices
|
||||
model = BertForMultipleChoice(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||
@@ -313,10 +318,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_bert_model(self, use_cuda=False):
|
||||
# ^^ This could be a real fixture
|
||||
if use_cuda:
|
||||
self.model_tester.device = "cuda"
|
||||
def test_bert_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_bert_model(*config_and_inputs)
|
||||
|
||||
@@ -356,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -27,10 +27,11 @@ import uuid
|
||||
|
||||
import unittest
|
||||
import logging
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
import numpy as np
|
||||
@@ -38,8 +39,6 @@ if is_torch_available():
|
||||
from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
|
||||
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
@@ -65,6 +64,7 @@ def _config_zero_init(config):
|
||||
|
||||
class CommonTestCases:
|
||||
|
||||
@require_torch
|
||||
class CommonModelTester(unittest.TestCase):
|
||||
|
||||
model_tester = None
|
||||
@@ -79,6 +79,7 @@ class CommonTestCases:
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs_dict)
|
||||
@@ -86,12 +87,13 @@ class CommonTestCases:
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model = model_class.from_pretrained(tmpdirname)
|
||||
model.to(torch_device)
|
||||
with torch.no_grad():
|
||||
after_outputs = model(**inputs_dict)
|
||||
|
||||
# Make sure we don't have nans
|
||||
out_1 = after_outputs[0].numpy()
|
||||
out_2 = outputs[0].numpy()
|
||||
out_1 = after_outputs[0].cpu().numpy()
|
||||
out_2 = outputs[0].cpu().numpy()
|
||||
out_1 = out_1[~np.isnan(out_1)]
|
||||
out_2 = out_2[~np.isnan(out_2)]
|
||||
max_diff = np.amax(np.abs(out_1 - out_2))
|
||||
@@ -113,6 +115,7 @@ class CommonTestCases:
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
|
||||
self.assertEqual(first.ne(second).sum().item(), 0)
|
||||
@@ -125,6 +128,7 @@ class CommonTestCases:
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
@@ -142,6 +146,7 @@ class CommonTestCases:
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = True
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(**inputs_dict)
|
||||
self.assertEqual(out_len+1, len(outputs))
|
||||
@@ -181,6 +186,7 @@ class CommonTestCases:
|
||||
configs_no_init.torchscript = True
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config=configs_no_init)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
inputs = inputs_dict['input_ids'] # Let's keep only input_ids
|
||||
|
||||
@@ -201,7 +207,10 @@ class CommonTestCases:
|
||||
except ValueError:
|
||||
self.fail("Couldn't load module.")
|
||||
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loaded_model.to(torch_device)
|
||||
loaded_model.eval()
|
||||
|
||||
model_params = model.parameters()
|
||||
@@ -228,11 +237,12 @@ class CommonTestCases:
|
||||
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config=configs_no_init)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
# Prepare head_mask
|
||||
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
||||
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
|
||||
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
|
||||
head_mask[0, 0] = 0
|
||||
head_mask[-1, :-1] = 0
|
||||
head_mask.requires_grad_(requires_grad=True)
|
||||
@@ -282,6 +292,7 @@ class CommonTestCases:
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
model = model_class(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||
-1: [0]}
|
||||
@@ -310,6 +321,7 @@ class CommonTestCases:
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
model = model_class(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||
-1: [0]}
|
||||
@@ -319,6 +331,7 @@ class CommonTestCases:
|
||||
os.makedirs(directory)
|
||||
model.save_pretrained(directory)
|
||||
model = model_class.from_pretrained(directory)
|
||||
model.to(torch_device)
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
@@ -346,6 +359,7 @@ class CommonTestCases:
|
||||
config.pruned_heads = heads_to_prune
|
||||
|
||||
model = model_class(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
@@ -372,6 +386,7 @@ class CommonTestCases:
|
||||
config.pruned_heads = heads_to_prune
|
||||
|
||||
model = model_class(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
@@ -388,6 +403,7 @@ class CommonTestCases:
|
||||
os.makedirs(directory)
|
||||
model.save_pretrained(directory)
|
||||
model = model_class.from_pretrained(directory)
|
||||
model.to(torch_device)
|
||||
shutil.rmtree(directory)
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
@@ -419,6 +435,7 @@ class CommonTestCases:
|
||||
config.output_hidden_states = True
|
||||
config.output_attentions = False
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(**inputs_dict)
|
||||
hidden_states = outputs[-1]
|
||||
@@ -538,6 +555,7 @@ class CommonTestCases:
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
wte = model.get_input_embeddings()
|
||||
@@ -628,6 +646,7 @@ class CommonTestCases:
|
||||
def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = self.base_model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(input_ids, position_ids, token_type_ids)
|
||||
@@ -643,6 +662,7 @@ class CommonTestCases:
|
||||
def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = self.lm_head_model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
||||
loss, lm_logits = outputs[:2]
|
||||
@@ -659,6 +679,7 @@ class CommonTestCases:
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(input_ids)
|
||||
presents = outputs[-1]
|
||||
@@ -671,6 +692,7 @@ class CommonTestCases:
|
||||
def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = self.double_head_model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
|
||||
token_type_ids=token_type_ids, position_ids=position_ids)
|
||||
@@ -716,7 +738,7 @@ class CommonTestCases:
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
self.create_and_check_presents(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def run_slow_tests(self):
|
||||
self.create_and_check_model_from_pretrained()
|
||||
|
||||
@@ -770,7 +792,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
|
||||
for _ in range(total_dims):
|
||||
values.append(rng.randint(0, vocab_size - 1))
|
||||
|
||||
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
|
||||
return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
|
||||
|
||||
|
||||
def floats_tensor(shape, scale=1.0, rng=None, name=None):
|
||||
@@ -786,11 +808,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
|
||||
for _ in range(total_dims):
|
||||
values.append(rng.random() * scale)
|
||||
|
||||
return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
|
||||
return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
|
||||
|
||||
|
||||
@require_torch
|
||||
class ModelUtilsTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -16,7 +16,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import shutil
|
||||
import pdb
|
||||
|
||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
||||
if is_torch_available():
|
||||
from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
CTRLLMHeadModel)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
|
||||
@@ -140,6 +139,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||
model = CTRLModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||
@@ -157,6 +157,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||
model = CTRLLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||
@@ -202,7 +203,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -17,7 +17,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -25,13 +24,13 @@ if is_torch_available():
|
||||
from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
|
||||
DistilBertForTokenClassification,
|
||||
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
|
||||
@@ -126,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = DistilBertModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
(sequence_output,) = model(input_ids, input_mask)
|
||||
(sequence_output,) = model(input_ids)
|
||||
@@ -139,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = DistilBertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
@@ -152,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = DistilBertForQuestionAnswering(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
|
||||
result = {
|
||||
@@ -170,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = DistilBertForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
|
||||
result = {
|
||||
@@ -184,6 +187,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = DistilBertForTokenClassification(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
|
||||
@@ -229,7 +233,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
|
||||
|
||||
# @pytest.mark.slow
|
||||
# @slow
|
||||
# def test_model_from_pretrained(self):
|
||||
# cache_dir = "/tmp/transformers_test/"
|
||||
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -15,19 +15,18 @@
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from .utils import require_torch, slow
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import BertModel, BertForMaskedLM, Model2Model
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
|
||||
@require_torch
|
||||
class EncoderDecoderModelTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model2model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -17,7 +17,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import shutil
|
||||
|
||||
from transformers import is_torch_available
|
||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
||||
if is_torch_available():
|
||||
from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
|
||||
@@ -136,6 +135,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||
model = GPT2Model(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||
@@ -153,6 +153,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||
model = GPT2LMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||
@@ -171,6 +172,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
|
||||
model = GPT2DoubleHeadsModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
|
||||
@@ -235,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -17,7 +17,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import shutil
|
||||
|
||||
from transformers import is_torch_available
|
||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
||||
if is_torch_available():
|
||||
from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
|
||||
@@ -124,6 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||
model = OpenAIGPTModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||
@@ -139,6 +139,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||
model = OpenAIGPTLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||
@@ -157,6 +158,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||
model = OpenAIGPTDoubleHeadsModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
|
||||
@@ -203,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -27,13 +26,13 @@ if is_torch_available():
|
||||
from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
|
||||
RobertaForSequenceClassification, RobertaForTokenClassification)
|
||||
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
|
||||
@@ -129,6 +128,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
|
||||
token_labels, choice_labels):
|
||||
model = RobertaModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||
@@ -146,6 +146,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
|
||||
token_labels, choice_labels):
|
||||
model = RobertaForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
@@ -161,6 +162,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = RobertaForTokenClassification(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
labels=token_labels)
|
||||
@@ -195,7 +197,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -207,10 +209,10 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
class RobertaModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_masked_lm(self):
|
||||
model = RobertaForMaskedLM.from_pretrained('roberta-base')
|
||||
|
||||
|
||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = torch.Size((1, 11, 50265))
|
||||
@@ -228,10 +230,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
|
||||
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_no_head(self):
|
||||
model = RobertaModel.from_pretrained('roberta-base')
|
||||
|
||||
|
||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
# compare the actual values for a slice.
|
||||
@@ -244,10 +246,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
|
||||
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_classification_head(self):
|
||||
model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
|
||||
|
||||
|
||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = torch.Size((1, 3))
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import AlbertConfig, is_tf_available
|
||||
|
||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
||||
from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
|
||||
TFAlbertForSequenceClassification,
|
||||
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (
|
||||
@@ -216,7 +215,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
self.model_tester.create_and_check_albert_for_sequence_classification(
|
||||
*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
# for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,11 +18,12 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from transformers import is_tf_available
|
||||
|
||||
from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
|
||||
|
||||
if is_tf_available():
|
||||
from transformers import (AutoConfig, BertConfig,
|
||||
TFAutoModel, TFBertModel,
|
||||
@@ -33,12 +34,11 @@ if is_tf_available():
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFAutoModelTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
import h5py
|
||||
self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
|
||||
@@ -54,7 +54,7 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, TFBertModel)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_lmhead_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -67,7 +67,7 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, TFBertForMaskedLM)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_sequence_classification_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -80,7 +80,7 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, TFBertForSequenceClassification)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_question_answering_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -93,6 +93,11 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, TFBertForQuestionAnswering)
|
||||
|
||||
def test_from_pretrained_identifier(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True)
|
||||
self.assertIsInstance(model, TFBertForMaskedLM)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import BertConfig, is_tf_available
|
||||
|
||||
@@ -36,10 +36,9 @@ if is_tf_available():
|
||||
TFBertForTokenClassification,
|
||||
TFBertForQuestionAnswering,
|
||||
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,
|
||||
@@ -309,7 +308,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -25,18 +25,17 @@ import unittest
|
||||
import uuid
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from transformers import is_tf_available, is_torch_available
|
||||
|
||||
from .utils import require_tf, slow
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
from transformers import TFPreTrainedModel
|
||||
# from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
@@ -62,6 +61,7 @@ def _config_zero_init(config):
|
||||
|
||||
class TFCommonTestCases:
|
||||
|
||||
@require_tf
|
||||
class TFCommonModelTester(unittest.TestCase):
|
||||
|
||||
model_tester = None
|
||||
@@ -164,7 +164,7 @@ class TFCommonTestCases:
|
||||
for model_class in self.all_model_classes:
|
||||
# Prepare our model
|
||||
model = model_class(config)
|
||||
|
||||
|
||||
# Let's load it from the disk to be sure we can use pretrained weights
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
outputs = model(inputs_dict) # build the model
|
||||
@@ -233,80 +233,6 @@ class TFCommonTestCases:
|
||||
self.model_tester.seq_length,
|
||||
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
|
||||
|
||||
def test_headmasking(self):
|
||||
pass
|
||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
# config.output_attentions = True
|
||||
# config.output_hidden_states = True
|
||||
# configs_no_init = _config_zero_init(config) # To be sure we have no Nan
|
||||
# for model_class in self.all_model_classes:
|
||||
# model = model_class(config=configs_no_init)
|
||||
# model.eval()
|
||||
|
||||
# # Prepare head_mask
|
||||
# # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
||||
# head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
|
||||
# head_mask[0, 0] = 0
|
||||
# head_mask[-1, :-1] = 0
|
||||
# head_mask.requires_grad_(requires_grad=True)
|
||||
# inputs = inputs_dict.copy()
|
||||
# inputs['head_mask'] = head_mask
|
||||
|
||||
# outputs = model(**inputs)
|
||||
|
||||
# # Test that we can get a gradient back for importance score computation
|
||||
# output = sum(t.sum() for t in outputs[0])
|
||||
# output = output.sum()
|
||||
# output.backward()
|
||||
# multihead_outputs = head_mask.grad
|
||||
|
||||
# attentions = outputs[-1]
|
||||
# hidden_states = outputs[-2]
|
||||
|
||||
# # Remove Nan
|
||||
|
||||
# self.assertIsNotNone(multihead_outputs)
|
||||
# self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
|
||||
# self.assertAlmostEqual(
|
||||
# attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
|
||||
# self.assertNotEqual(
|
||||
# attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
|
||||
# self.assertNotEqual(
|
||||
# attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
|
||||
# self.assertAlmostEqual(
|
||||
# attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
|
||||
# self.assertNotEqual(
|
||||
# attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
|
||||
|
||||
|
||||
def test_head_pruning(self):
|
||||
pass
|
||||
# if not self.test_pruning:
|
||||
# return
|
||||
|
||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
# for model_class in self.all_model_classes:
|
||||
# config.output_attentions = True
|
||||
# config.output_hidden_states = False
|
||||
# model = model_class(config=config)
|
||||
# model.eval()
|
||||
# heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||
# -1: [0]}
|
||||
# model.prune_heads(heads_to_prune)
|
||||
# outputs = model(**inputs_dict)
|
||||
|
||||
# attentions = outputs[-1]
|
||||
|
||||
# self.assertEqual(
|
||||
# attentions[0].shape[-3], 1)
|
||||
# self.assertEqual(
|
||||
# attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
||||
# self.assertEqual(
|
||||
# attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||
|
||||
|
||||
def test_hidden_states_output(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -323,43 +249,6 @@ class TFCommonTestCases:
|
||||
list(hidden_states[0].shape[-2:]),
|
||||
[self.model_tester.seq_length, self.model_tester.hidden_size])
|
||||
|
||||
|
||||
def test_resize_tokens_embeddings(self):
|
||||
pass
|
||||
# original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
# if not self.test_resize_embeddings:
|
||||
# return
|
||||
|
||||
# for model_class in self.all_model_classes:
|
||||
# config = copy.deepcopy(original_config)
|
||||
# model = model_class(config)
|
||||
|
||||
# model_vocab_size = config.vocab_size
|
||||
# # Retrieve the embeddings and clone theme
|
||||
# model_embed = model.resize_token_embeddings(model_vocab_size)
|
||||
# cloned_embeddings = model_embed.weight.clone()
|
||||
|
||||
# # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
|
||||
# model_embed = model.resize_token_embeddings(model_vocab_size + 10)
|
||||
# self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
|
||||
# # Check that it actually resizes the embeddings matrix
|
||||
# self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
|
||||
|
||||
# # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
|
||||
# model_embed = model.resize_token_embeddings(model_vocab_size - 15)
|
||||
# self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
|
||||
# # Check that it actually resizes the embeddings matrix
|
||||
# self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
|
||||
|
||||
# # Check that adding and removing tokens has not modified the first part of the embedding matrix.
|
||||
# models_equal = True
|
||||
# for p1, p2 in zip(cloned_embeddings, model_embed.weight):
|
||||
# if p1.data.ne(p2.data).sum() > 0:
|
||||
# models_equal = False
|
||||
|
||||
# self.assertTrue(models_equal)
|
||||
|
||||
|
||||
def test_model_common_attributes(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -369,40 +258,6 @@ class TFCommonTestCases:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None or isinstance(x, tf.keras.layers.Layer)
|
||||
|
||||
|
||||
def test_tie_model_weights(self):
|
||||
pass
|
||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
# def check_same_values(layer_1, layer_2):
|
||||
# equal = True
|
||||
# for p1, p2 in zip(layer_1.weight, layer_2.weight):
|
||||
# if p1.data.ne(p2.data).sum() > 0:
|
||||
# equal = False
|
||||
# return equal
|
||||
|
||||
# for model_class in self.all_model_classes:
|
||||
# if not hasattr(model_class, 'tie_weights'):
|
||||
# continue
|
||||
|
||||
# config.torchscript = True
|
||||
# model_not_tied = model_class(config)
|
||||
# params_not_tied = list(model_not_tied.parameters())
|
||||
|
||||
# config_tied = copy.deepcopy(config)
|
||||
# config_tied.torchscript = False
|
||||
# model_tied = model_class(config_tied)
|
||||
# params_tied = list(model_tied.parameters())
|
||||
|
||||
# # Check that the embedding layer and decoding layer are the same in size and in value
|
||||
# self.assertGreater(len(params_not_tied), len(params_tied))
|
||||
|
||||
# # Check that after resize they remain tied.
|
||||
# model_tied.resize_token_embeddings(config.vocab_size + 10)
|
||||
# params_tied_2 = list(model_tied.parameters())
|
||||
# self.assertGreater(len(params_not_tied), len(params_tied))
|
||||
# self.assertEqual(len(params_tied_2), len(params_tied))
|
||||
|
||||
def test_determinism(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -461,29 +316,5 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
|
||||
return output
|
||||
|
||||
|
||||
class TFModelUtilsTest(unittest.TestCase):
|
||||
@pytest.mark.skipif('tensorflow' not in sys.modules, reason="requires TensorFlow")
|
||||
def test_model_from_pretrained(self):
|
||||
pass
|
||||
# logging.basicConfig(level=logging.INFO)
|
||||
# for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
# config = BertConfig.from_pretrained(model_name)
|
||||
# self.assertIsNotNone(config)
|
||||
# self.assertIsInstance(config, PretrainedConfig)
|
||||
|
||||
# model = BertModel.from_pretrained(model_name)
|
||||
# model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
|
||||
# self.assertIsNotNone(model)
|
||||
# self.assertIsInstance(model, PreTrainedModel)
|
||||
# for value in loading_info.values():
|
||||
# self.assertEqual(len(value), 0)
|
||||
|
||||
# config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
||||
# model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
||||
# self.assertEqual(model.config.output_attentions, True)
|
||||
# self.assertEqual(model.config.output_hidden_states, True)
|
||||
# self.assertEqual(model.config, config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import CTRLConfig, is_tf_available
|
||||
|
||||
@@ -30,10 +30,9 @@ if is_tf_available():
|
||||
import tensorflow as tf
|
||||
from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
|
||||
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
|
||||
@@ -188,7 +187,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -17,10 +17,10 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import DistilBertConfig, is_tf_available
|
||||
|
||||
@@ -30,10 +30,9 @@ if is_tf_available():
|
||||
TFDistilBertForMaskedLM,
|
||||
TFDistilBertForQuestionAnswering,
|
||||
TFDistilBertForSequenceClassification)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
|
||||
@@ -210,7 +209,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
|
||||
|
||||
# @pytest.mark.slow
|
||||
# @slow
|
||||
# def test_model_from_pretrained(self):
|
||||
# cache_dir = "/tmp/transformers_test/"
|
||||
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import GPT2Config, is_tf_available
|
||||
|
||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
||||
from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
|
||||
TFGPT2DoubleHeadsModel,
|
||||
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
|
||||
@@ -219,7 +218,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import OpenAIGPTConfig, is_tf_available
|
||||
|
||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
||||
from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
|
||||
TFOpenAIGPTDoubleHeadsModel,
|
||||
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
|
||||
@@ -218,7 +217,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,10 +18,10 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import RobertaConfig, is_tf_available
|
||||
|
||||
@@ -32,10 +32,9 @@ if is_tf_available():
|
||||
TFRobertaForSequenceClassification,
|
||||
TFRobertaForTokenClassification,
|
||||
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM,
|
||||
@@ -191,7 +190,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -203,10 +202,10 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
class TFRobertaModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_masked_lm(self):
|
||||
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
|
||||
|
||||
|
||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = [1, 11, 50265]
|
||||
@@ -224,10 +223,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
|
||||
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_no_head(self):
|
||||
model = TFRobertaModel.from_pretrained('roberta-base')
|
||||
|
||||
|
||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
# compare the actual values for a slice.
|
||||
@@ -240,10 +239,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
|
||||
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_classification_head(self):
|
||||
model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
|
||||
|
||||
|
||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = [1, 3]
|
||||
|
||||
@@ -19,10 +19,10 @@ from __future__ import print_function
|
||||
import unittest
|
||||
import random
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import TransfoXLConfig, is_tf_available
|
||||
|
||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
||||
from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
|
||||
TFTransfoXLLMHeadModel,
|
||||
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
|
||||
@@ -204,7 +203,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_tf_available
|
||||
|
||||
@@ -29,13 +28,13 @@ if is_tf_available():
|
||||
TFXLMForSequenceClassification,
|
||||
TFXLMForQuestionAnsweringSimple,
|
||||
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
|
||||
@@ -251,7 +250,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -21,7 +21,6 @@ import unittest
|
||||
import json
|
||||
import random
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import XLNetConfig, is_tf_available
|
||||
|
||||
@@ -30,18 +29,21 @@ if is_tf_available():
|
||||
|
||||
from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
|
||||
TFXLNetForSequenceClassification,
|
||||
TFXLNetForTokenClassification,
|
||||
TFXLNetForQuestionAnsweringSimple,
|
||||
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel,
|
||||
TFXLNetForSequenceClassification,
|
||||
TFXLNetForTokenClassification,
|
||||
TFXLNetForQuestionAnsweringSimple) if is_tf_available() else ()
|
||||
test_pruning = False
|
||||
|
||||
@@ -258,6 +260,26 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
list(list(mem.shape) for mem in result["mems_1"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def create_and_check_xlnet_for_token_classification(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
config.num_labels = input_ids_1.shape[1]
|
||||
model = TFXLNetForTokenClassification(config)
|
||||
inputs = {'input_ids': input_ids_1,
|
||||
'attention_mask': input_mask,
|
||||
# 'token_type_ids': token_type_ids
|
||||
}
|
||||
logits, mems_1 = model(inputs)
|
||||
result = {
|
||||
"mems_1": [mem.numpy() for mem in mems_1],
|
||||
"logits": logits.numpy(),
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["logits"].shape),
|
||||
[self.batch_size, self.seq_length, config.num_labels])
|
||||
self.parent.assertListEqual(
|
||||
list(list(mem.shape) for mem in result["mems_1"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
@@ -282,19 +304,23 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
def test_xlnet_lm_head(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||
|
||||
def test_xlnet_sequence_classif(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
|
||||
|
||||
def test_xlnet_token_classification(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_for_token_classification(*config_and_inputs)
|
||||
|
||||
def test_xlnet_qa(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -19,7 +19,6 @@ from __future__ import print_function
|
||||
import unittest
|
||||
import random
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -27,12 +26,13 @@ if is_torch_available():
|
||||
import torch
|
||||
from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
||||
from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
|
||||
@@ -111,6 +111,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
|
||||
model = TransfoXLModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
hidden_states_1, mems_1 = model(input_ids_1)
|
||||
@@ -140,6 +141,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
|
||||
model = TransfoXLLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
lm_logits_1, mems_1 = model(input_ids_1)
|
||||
@@ -204,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
|
||||
self.model_tester.check_transfo_xl_lm_head_output(output_result)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -26,13 +25,13 @@ if is_torch_available():
|
||||
from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
|
||||
XLMForSequenceClassification, XLMForQuestionAnsweringSimple)
|
||||
from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
|
||||
@@ -148,6 +147,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
|
||||
outputs = model(input_ids, langs=token_type_ids)
|
||||
@@ -163,6 +163,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMWithLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
|
||||
@@ -182,6 +183,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMForQuestionAnsweringSimple(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(input_ids)
|
||||
@@ -206,6 +208,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMForQuestionAnswering(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(input_ids)
|
||||
@@ -260,6 +263,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
(logits,) = model(input_ids)
|
||||
@@ -312,7 +316,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -21,24 +21,25 @@ import unittest
|
||||
import json
|
||||
import random
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
|
||||
from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification,
|
||||
XLNetForTokenClassification, XLNetForQuestionAnswering)
|
||||
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes=(XLNetModel, XLNetLMHeadModel,
|
||||
all_model_classes=(XLNetModel, XLNetLMHeadModel, XLNetForTokenClassification,
|
||||
XLNetForSequenceClassification, XLNetForQuestionAnswering) if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
|
||||
@@ -99,18 +100,20 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
|
||||
|
||||
input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
|
||||
perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float)
|
||||
perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
|
||||
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
|
||||
target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float)
|
||||
target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
|
||||
target_mapping[:, 0, -1] = 1.0 # predict last token
|
||||
|
||||
sequence_labels = None
|
||||
lm_labels = None
|
||||
is_impossible_labels = None
|
||||
token_labels = None
|
||||
if self.use_labels:
|
||||
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
config = XLNetConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
@@ -129,15 +132,16 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
num_labels=self.type_sequence_label_size)
|
||||
|
||||
return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels)
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels)
|
||||
|
||||
def set_seed(self):
|
||||
random.seed(self.seed)
|
||||
torch.manual_seed(self.seed)
|
||||
|
||||
def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
_, _ = model(input_ids_1, input_mask=input_mask)
|
||||
@@ -152,6 +156,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
config.mem_len = 0
|
||||
model = XLNetModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
no_mems_outputs = model(input_ids_1)
|
||||
self.parent.assertEqual(len(no_mems_outputs), 1)
|
||||
@@ -163,9 +168,23 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
list(list(mem.size()) for mem in result["mems_1"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def create_and_check_xlnet_base_model_with_att_output(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
_, _, attentions = model(input_ids_1, target_mapping=target_mapping)
|
||||
|
||||
self.parent.assertEqual(len(attentions), config.n_layer)
|
||||
self.parent.assertIsInstance(attentions[0], tuple)
|
||||
self.parent.assertEqual(len(attentions[0]), 2)
|
||||
self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape)
|
||||
|
||||
def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
|
||||
@@ -204,8 +223,9 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetForQuestionAnswering(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(input_ids_1)
|
||||
@@ -261,9 +281,43 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
list(list(mem.size()) for mem in result["mems"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def create_and_check_xlnet_token_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetForTokenClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
logits, mems_1 = model(input_ids_1)
|
||||
loss, logits, mems_1 = model(input_ids_1, labels=token_labels)
|
||||
|
||||
result = {
|
||||
"loss": loss,
|
||||
"mems_1": mems_1,
|
||||
"logits": logits,
|
||||
}
|
||||
|
||||
self.parent.assertListEqual(
|
||||
list(result["loss"].size()),
|
||||
[])
|
||||
self.parent.assertListEqual(
|
||||
list(result["logits"].size()),
|
||||
[self.batch_size, self.seq_length, self.type_sequence_label_size])
|
||||
self.parent.assertListEqual(
|
||||
list(list(mem.size()) for mem in result["mems_1"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels,
|
||||
sequence_labels, is_impossible_labels) = config_and_inputs
|
||||
inputs_dict = {'input_ids': input_ids_1}
|
||||
return config, inputs_dict
|
||||
|
||||
def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
logits, mems_1 = model(input_ids_1)
|
||||
@@ -289,7 +343,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels,
|
||||
sequence_labels, is_impossible_labels) = config_and_inputs
|
||||
sequence_labels, is_impossible_labels, token_labels) = config_and_inputs
|
||||
inputs_dict = {'input_ids': input_ids_1}
|
||||
return config, inputs_dict
|
||||
|
||||
@@ -306,22 +360,33 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
|
||||
|
||||
def test_xlnet_base_model_with_att_output(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
config_and_inputs[0].output_attentions = True
|
||||
self.model_tester.create_and_check_xlnet_base_model_with_att_output(*config_and_inputs)
|
||||
|
||||
def test_xlnet_lm_head(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||
|
||||
def test_xlnet_sequence_classif(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
|
||||
|
||||
def test_xlnet_token_classif(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_token_classif(*config_and_inputs)
|
||||
|
||||
def test_xlnet_qa(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -31,10 +30,9 @@ if is_torch_available():
|
||||
get_cosine_schedule_with_warmup,
|
||||
get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
get_linear_schedule_with_warmup)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .tokenization_tests_commons import TemporaryDirectory
|
||||
from .utils import require_torch
|
||||
|
||||
|
||||
def unwrap_schedule(scheduler, num_steps=10):
|
||||
@@ -58,6 +56,7 @@ def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
|
||||
scheduler.load_state_dict(state_dict)
|
||||
return lrs
|
||||
|
||||
@require_torch
|
||||
class OptimizationTest(unittest.TestCase):
|
||||
|
||||
def assertListAlmostEqual(self, list1, list2, tol):
|
||||
@@ -80,6 +79,7 @@ class OptimizationTest(unittest.TestCase):
|
||||
self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
|
||||
|
||||
|
||||
@require_torch
|
||||
class ScheduleInitTest(unittest.TestCase):
|
||||
m = torch.nn.Linear(50, 50) if is_torch_available() else None
|
||||
optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None
|
||||
|
||||
90
transformers/tests/optimization_tf_test.py
Normal file
90
transformers/tests/optimization_tf_test.py
Normal file
@@ -0,0 +1,90 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers import is_tf_available
|
||||
|
||||
from .utils import require_tf
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import ops
|
||||
from transformers import (create_optimizer, GradientAccumulator)
|
||||
|
||||
|
||||
@require_tf
|
||||
class OptimizationFTest(unittest.TestCase):
|
||||
def assertListAlmostEqual(self, list1, list2, tol):
|
||||
self.assertEqual(len(list1), len(list2))
|
||||
for a, b in zip(list1, list2):
|
||||
self.assertAlmostEqual(a, b, delta=tol)
|
||||
|
||||
def testGradientAccumulator(self):
|
||||
accumulator = GradientAccumulator()
|
||||
accumulator([tf.constant([1.0, 2.0])])
|
||||
accumulator([tf.constant([-2.0, 1.0])])
|
||||
accumulator([tf.constant([-1.0, 2.0])])
|
||||
with self.assertRaises(ValueError):
|
||||
accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
|
||||
self.assertEqual(accumulator.step, 3)
|
||||
self.assertEqual(len(accumulator.gradients), 1)
|
||||
self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
|
||||
accumulator.reset()
|
||||
self.assertEqual(accumulator.step, 0)
|
||||
self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)
|
||||
|
||||
def testGradientAccumulatorDistributionStrategy(self):
|
||||
context._context = None
|
||||
ops.enable_eager_execution_internal()
|
||||
physical_devices = tf.config.experimental.list_physical_devices("CPU")
|
||||
tf.config.experimental.set_virtual_device_configuration(
|
||||
physical_devices[0],
|
||||
[tf.config.experimental.VirtualDeviceConfiguration(),
|
||||
tf.config.experimental.VirtualDeviceConfiguration()])
|
||||
|
||||
devices = tf.config.experimental.list_logical_devices(device_type="CPU")
|
||||
strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices])
|
||||
|
||||
with strategy.scope():
|
||||
accumulator = GradientAccumulator()
|
||||
variable = tf.Variable([4.0, 3.0])
|
||||
optimizer = create_optimizer(5e-5, 10, 5)
|
||||
gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
|
||||
|
||||
def accumulate_on_replica(gradient):
|
||||
accumulator([gradient])
|
||||
|
||||
def apply_on_replica():
|
||||
optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])), 1.0)
|
||||
|
||||
@tf.function
|
||||
def accumulate(grad1, grad2):
|
||||
with strategy.scope():
|
||||
gradient_placeholder.values[0].assign(grad1)
|
||||
gradient_placeholder.values[1].assign(grad2)
|
||||
strategy.experimental_run_v2(accumulate_on_replica, args=(gradient_placeholder,))
|
||||
|
||||
@tf.function
|
||||
def apply_grad():
|
||||
with strategy.scope():
|
||||
strategy.experimental_run_v2(apply_on_replica)
|
||||
|
||||
accumulate([1.0, 2.0], [-1.0, 1.0])
|
||||
accumulate([3.0, -1.0], [-1.0, -1.0])
|
||||
accumulate([-2.0, 2.0], [3.0, -2.0])
|
||||
self.assertEqual(accumulator.step, 3)
|
||||
self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [2.0, 3.0], tol=1e-2)
|
||||
self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [1.0, -2.0], tol=1e-2)
|
||||
apply_grad()
|
||||
self.assertListAlmostEqual(variable.value().numpy().tolist(), [4.0, 3.0], tol=1e-2)
|
||||
accumulator.reset()
|
||||
self.assertEqual(accumulator.step, 0)
|
||||
self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
|
||||
self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -18,15 +18,16 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
|
||||
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
from .utils import slow, SMALL_MODEL_IDENTIFIER
|
||||
|
||||
|
||||
class AutoTokenizerTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_tokenizer_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -41,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
|
||||
self.assertIsInstance(tokenizer, GPT2Tokenizer)
|
||||
self.assertGreater(len(tokenizer), 0)
|
||||
|
||||
def test_tokenizer_from_pretrained_identifier(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
|
||||
self.assertIsInstance(tokenizer, BertTokenizer)
|
||||
self.assertEqual(len(tokenizer), 12)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user