Merge branch 'master' into squad-refactor
This commit is contained in:
@@ -23,3 +23,4 @@ deploy_doc "fe02e45" v1.1.0
|
|||||||
deploy_doc "89fd345" v1.2.0
|
deploy_doc "89fd345" v1.2.0
|
||||||
deploy_doc "fc9faa8" v2.0.0
|
deploy_doc "fc9faa8" v2.0.0
|
||||||
deploy_doc "3ddce1d" v2.1.1
|
deploy_doc "3ddce1d" v2.1.1
|
||||||
|
deploy_doc "3616209" v2.2.0
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ Follow these steps to start contributing:
|
|||||||
```bash
|
```bash
|
||||||
$ git clone git@github.com:<your Github handle>/transformers.git
|
$ git clone git@github.com:<your Github handle>/transformers.git
|
||||||
$ cd transformers
|
$ cd transformers
|
||||||
$ git remote add upstream git@github.com:huggingface/transformers.git
|
$ git remote add upstream https://github.com/huggingface/transformers.git
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Create a new branch to hold your development changes:
|
3. Create a new branch to hold your development changes:
|
||||||
|
|||||||
23
README.md
23
README.md
@@ -58,7 +58,7 @@ Choose the right framework for every part of a model's lifetime
|
|||||||
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
||||||
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
||||||
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
||||||
| [Documentation](https://huggingface.co/transformers/) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) | Full API documentation and more |
|
| [Documentation](https://huggingface.co/transformers/) [(v2.2.0/v2.2.1)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
@@ -89,30 +89,38 @@ pip install [--editable] .
|
|||||||
### Run the examples
|
### Run the examples
|
||||||
|
|
||||||
Examples are included in the repository but are not shipped with the library.
|
Examples are included in the repository but are not shipped with the library.
|
||||||
Therefore, in order to run the examples you will first need to clone the
|
Therefore, in order to run the latest versions of the examples you also need to install from source. To do so, create a new virtual environment and follow these steps:
|
||||||
repository and install the bleeding edge version of the library. To do so, create a new virtual environment and follow these steps:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone git@github.com:huggingface/transformers
|
git clone https://github.com/huggingface/transformers
|
||||||
cd transformers
|
cd transformers
|
||||||
pip install .
|
pip install [--editable] .
|
||||||
```
|
```
|
||||||
|
|
||||||
### Tests
|
### Tests
|
||||||
|
|
||||||
A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||||
|
|
||||||
These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
These tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||||
|
|
||||||
Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
|
Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
|
||||||
|
|
||||||
You can run the tests from the root of the cloned repository with the commands:
|
You can run the tests from the root of the cloned repository with the commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||||
|
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m pytest -sv ./transformers/tests/
|
python -m pytest -sv ./transformers/tests/
|
||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||||
|
|
||||||
### Do you want to run a Transformer model on a mobile device?
|
### Do you want to run a Transformer model on a mobile device?
|
||||||
|
|
||||||
You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
|
You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
|
||||||
@@ -132,9 +140,10 @@ At some point in the future, you'll be able to seamlessly move from pre-training
|
|||||||
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||||
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||||
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
|
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
|
||||||
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||||
10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||||
|
11. **[ALBERT](https://github.com/google-research/google-research/tree/master/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||||
11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
|
11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
|
||||||
|
|
||||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
||||||
|
|||||||
22
deploy_multi_version_doc.sh
Normal file
22
deploy_multi_version_doc.sh
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
cd docs
|
||||||
|
|
||||||
|
function deploy_doc(){
|
||||||
|
echo "Creating doc at commit $1 and pushing to folder $2"
|
||||||
|
git checkout $1
|
||||||
|
if [ ! -z "$2" ]
|
||||||
|
then
|
||||||
|
echo "Pushing version" $2
|
||||||
|
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
|
||||||
|
else
|
||||||
|
echo "Pushing master"
|
||||||
|
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
deploy_doc "master"
|
||||||
|
deploy_doc "b33a385" v1.0.0
|
||||||
|
deploy_doc "fe02e45" v1.1.0
|
||||||
|
deploy_doc "89fd345" v1.2.0
|
||||||
|
deploy_doc "fc9faa8" v2.0.0
|
||||||
|
deploy_doc "3ddce1d" v2.1.1
|
||||||
|
deploy_doc "f2f3294" v2.2.0
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
function addIcon() {
|
function addIcon() {
|
||||||
const huggingFaceLogo = "https://huggingface.co/assets/transformers-docs/huggingface_logo.svg";
|
const huggingFaceLogo = "https://huggingface.co/landing/assets/transformers-docs/huggingface_logo.svg";
|
||||||
const image = document.createElement("img");
|
const image = document.createElement("img");
|
||||||
image.setAttribute("src", huggingFaceLogo);
|
image.setAttribute("src", huggingFaceLogo);
|
||||||
|
|
||||||
@@ -24,10 +24,10 @@ function addCustomFooter() {
|
|||||||
social.classList.add("footer__Social");
|
social.classList.add("footer__Social");
|
||||||
|
|
||||||
const imageDetails = [
|
const imageDetails = [
|
||||||
{ link: "https://huggingface.co", imageLink: "https://huggingface.co/assets/transformers-docs/website.svg" },
|
{ link: "https://huggingface.co", imageLink: "https://huggingface.co/landing/assets/transformers-docs/website.svg" },
|
||||||
{ link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/assets/transformers-docs/twitter.svg" },
|
{ link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/twitter.svg" },
|
||||||
{ link: "https://github.com/huggingface", imageLink: "https://huggingface.co/assets/transformers-docs/github.svg" },
|
{ link: "https://github.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/github.svg" },
|
||||||
{ link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/assets/transformers-docs/linkedin.svg" }
|
{ link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/landing/assets/transformers-docs/linkedin.svg" }
|
||||||
];
|
];
|
||||||
|
|
||||||
imageDetails.forEach(imageLinks => {
|
imageDetails.forEach(imageLinks => {
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ author = u'huggingface'
|
|||||||
# The short X.Y version
|
# The short X.Y version
|
||||||
version = u''
|
version = u''
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = u'2.1.1'
|
release = u'2.2.1'
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|||||||
@@ -47,6 +47,9 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
|||||||
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
||||||
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
|
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
|
||||||
|
9. `CTRL <https://github.com/salesforce/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||||
|
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
|
||||||
|
11. `ALBERT <https://github.com/google-research/google-research/tree/master/albert>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
@@ -89,3 +92,5 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
|||||||
model_doc/roberta
|
model_doc/roberta
|
||||||
model_doc/distilbert
|
model_doc/distilbert
|
||||||
model_doc/ctrl
|
model_doc/ctrl
|
||||||
|
model_doc/camembert
|
||||||
|
model_doc/albert
|
||||||
|
|||||||
@@ -24,15 +24,24 @@ pip install [--editable] .
|
|||||||
|
|
||||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||||
|
|
||||||
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
Tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||||
|
|
||||||
Run all the tests from the root of the cloned repository with the commands:
|
Run all the tests from the root of the cloned repository with the commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||||
|
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python -m pytest -sv ./transformers/tests/
|
python -m pytest -sv ./transformers/tests/
|
||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||||
|
|
||||||
## OpenAI GPT original tokenization workflow
|
## OpenAI GPT original tokenization workflow
|
||||||
|
|
||||||
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
|
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ The ``.optimization`` module provides:
|
|||||||
|
|
||||||
- an optimizer with weight decay fixed that can be used to fine-tune models, and
|
- an optimizer with weight decay fixed that can be used to fine-tune models, and
|
||||||
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
|
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
|
||||||
|
- a gradient accumulation class to accumulate the gradients of multiple batches
|
||||||
|
|
||||||
``AdamW``
|
``AdamW``
|
||||||
~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~
|
||||||
@@ -12,6 +13,15 @@ The ``.optimization`` module provides:
|
|||||||
.. autoclass:: transformers.AdamW
|
.. autoclass:: transformers.AdamW
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
``AdamWeightDecay``
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AdamWeightDecay
|
||||||
|
:members:
|
||||||
|
|
||||||
|
.. autofunction:: transformers.create_optimizer
|
||||||
|
:members:
|
||||||
|
|
||||||
Schedules
|
Schedules
|
||||||
----------------------------------------------------
|
----------------------------------------------------
|
||||||
|
|
||||||
@@ -49,3 +59,17 @@ Learning Rate Schedules
|
|||||||
.. image:: /imgs/warmup_linear_schedule.png
|
.. image:: /imgs/warmup_linear_schedule.png
|
||||||
:target: /imgs/warmup_linear_schedule.png
|
:target: /imgs/warmup_linear_schedule.png
|
||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
``Warmup``
|
||||||
|
~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.Warmup
|
||||||
|
:members:
|
||||||
|
|
||||||
|
Gradient Strategies
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
``GradientAccumulator``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.GradientAccumulator
|
||||||
|
|||||||
@@ -54,10 +54,28 @@ Additionally, the following method can be used to load values from a data file
|
|||||||
Example usage
|
Example usage
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
An example using these processors is given in the
|
An example using these processors is given in the `run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
|
||||||
`run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
|
|
||||||
|
|
||||||
|
|
||||||
|
XNLI
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
|
||||||
|
the quality of cross-lingual text representations.
|
||||||
|
XNLI is a crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`__: pairs of text are labeled with textual entailment
|
||||||
|
annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
|
||||||
|
|
||||||
|
It was released together with the paper
|
||||||
|
`XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__
|
||||||
|
|
||||||
|
This library hosts the processor to load the XNLI data:
|
||||||
|
- :class:`~transformers.data.processors.utils.XnliProcessor`
|
||||||
|
|
||||||
|
Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
|
||||||
|
|
||||||
|
An example using these processors is given in the
|
||||||
|
`run_xnli.py <https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py>`__ script.
|
||||||
|
|
||||||
|
|
||||||
SQuAD
|
SQuAD
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
@@ -89,9 +107,9 @@ that can be used as model inputs.
|
|||||||
These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
|
These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
|
||||||
Examples are given below.
|
Examples are given below.
|
||||||
|
|
||||||
|
|
||||||
Example usage
|
Example usage
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
Here is an example using the processors as well as the conversion method using data files:
|
Here is an example using the processors as well as the conversion method using data files:
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
@@ -132,4 +150,4 @@ Example::
|
|||||||
|
|
||||||
|
|
||||||
Another example using these processors is given in the
|
Another example using these processors is given in the
|
||||||
`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
|
`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
|
||||||
|
|||||||
@@ -104,6 +104,6 @@ for batch in train_data:
|
|||||||
loss = model(batch)
|
loss = model(batch)
|
||||||
loss.backward()
|
loss.backward()
|
||||||
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
|
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
|
||||||
scheduler.step()
|
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
scheduler.step()
|
||||||
```
|
```
|
||||||
|
|||||||
64
docs/source/model_doc/albert.rst
Normal file
64
docs/source/model_doc/albert.rst
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
ALBERT
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
``AlbertConfig``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertConfig
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertTokenizer``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertTokenizer
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertForQuestionAnswering``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFAlbertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFAlbertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFAlbertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFAlbertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFAlbertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFAlbertForSequenceClassification
|
||||||
|
:members:
|
||||||
50
docs/source/model_doc/camembert.rst
Normal file
50
docs/source/model_doc/camembert.rst
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
CamemBERT
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
``CamembertConfig``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertConfig
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertTokenizer``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertTokenizer
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertForMultipleChoice``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertForMultipleChoice
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertForTokenClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertForTokenClassification
|
||||||
|
:members:
|
||||||
@@ -151,6 +151,14 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
|
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
|
||||||
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
|
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
|
||||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||||
|
| | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. |
|
||||||
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``distilbert-base-multilingual-cased`` | | 6-layer, 768-hidden, 12-heads, 134M parameters |
|
||||||
|
| | | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint. |
|
||||||
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters |
|
| CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters |
|
||||||
| | | | Salesforce's Large-sized CTRL English model |
|
| | | | Salesforce's Large-sized CTRL English model |
|
||||||
@@ -159,5 +167,38 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| | | | CamemBERT using the BERT-base architecture |
|
| | | | CamemBERT using the BERT-base architecture |
|
||||||
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__) |
|
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||||
|
| | | | ALBERT base model |
|
||||||
|
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||||
|
| | | | ALBERT large model |
|
||||||
|
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||||
|
| | | | ALBERT xlarge model |
|
||||||
|
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-xxlarge-v1`` | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||||
|
| | | | ALBERT xxlarge model |
|
||||||
|
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||||
|
| | | | ALBERT base model with no dropout, additional training data and longer training |
|
||||||
|
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||||
|
| | | | ALBERT large model with no dropout, additional training data and longer training |
|
||||||
|
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||||
|
| | | | ALBERT xlarge model with no dropout, additional training data and longer training |
|
||||||
|
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-xxlarge-v2`` | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||||
|
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
|
||||||
|
| | | (see `details <https://github.com/google-research/google-research/tree/master/albert>`__) |
|
||||||
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
|
||||||
|
|
||||||
.. <https://huggingface.co/transformers/examples.html>`__
|
.. <https://huggingface.co/transformers/examples.html>`__
|
||||||
|
|||||||
@@ -4,12 +4,14 @@ In this section a few examples are put together. All of these examples work for
|
|||||||
similar API between the different models.
|
similar API between the different models.
|
||||||
|
|
||||||
**Important**
|
**Important**
|
||||||
To use the examples, execute the following steps in a new virtual environment:
|
To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
|
||||||
|
Execute the following steps in a new virtual environment:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone git@github.com:huggingface/transformers
|
git clone https://github.com/huggingface/transformers
|
||||||
cd transformers
|
cd transformers
|
||||||
pip install .
|
pip install [--editable] .
|
||||||
|
pip install -r ./examples/requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
@@ -21,6 +23,7 @@ pip install .
|
|||||||
| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
|
| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
|
||||||
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
||||||
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
|
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
|
||||||
|
| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
|
||||||
| [Abstractive summarization](#abstractive-summarization) | Fine-tuning the library models for abstractive summarization tasks on the CNN/Daily Mail dataset. |
|
| [Abstractive summarization](#abstractive-summarization) | Fine-tuning the library models for abstractive summarization tasks on the CNN/Daily Mail dataset. |
|
||||||
|
|
||||||
## TensorFlow 2.0 Bert models on GLUE
|
## TensorFlow 2.0 Bert models on GLUE
|
||||||
@@ -464,7 +467,8 @@ Training with the previously defined hyper-parameters yields the following resul
|
|||||||
|
|
||||||
## Named Entity Recognition
|
## Named Entity Recognition
|
||||||
|
|
||||||
Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py).
|
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
|
||||||
|
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
|
||||||
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
|
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
|
||||||
Details and results for the fine-tuning provided by @stefan-it.
|
Details and results for the fine-tuning provided by @stefan-it.
|
||||||
|
|
||||||
@@ -509,7 +513,7 @@ The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so
|
|||||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
### Training
|
### Prepare the run
|
||||||
|
|
||||||
Additional environment variables must be set:
|
Additional environment variables must be set:
|
||||||
|
|
||||||
@@ -521,6 +525,8 @@ export SAVE_STEPS=750
|
|||||||
export SEED=1
|
export SEED=1
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Run the Pytorch version
|
||||||
|
|
||||||
To start training, just run:
|
To start training, just run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -541,7 +547,7 @@ python3 run_ner.py --data_dir ./ \
|
|||||||
|
|
||||||
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
|
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
|
||||||
|
|
||||||
### Evaluation
|
#### Evaluation
|
||||||
|
|
||||||
Evaluation on development dataset outputs the following for our example:
|
Evaluation on development dataset outputs the following for our example:
|
||||||
|
|
||||||
@@ -563,7 +569,7 @@ On the test dataset the following results could be achieved:
|
|||||||
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
|
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
|
||||||
```
|
```
|
||||||
|
|
||||||
### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
|
#### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
|
||||||
|
|
||||||
Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
|
Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
|
||||||
|
|
||||||
@@ -573,6 +579,72 @@ Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) a
|
|||||||
| `roberta-large` | 95.96 | 91.87
|
| `roberta-large` | 95.96 | 91.87
|
||||||
| `distilbert-base-uncased` | 94.34 | 90.32
|
| `distilbert-base-uncased` | 94.34 | 90.32
|
||||||
|
|
||||||
|
### Run the Tensorflow 2 version
|
||||||
|
|
||||||
|
To start training, just run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 run_tf_ner.py --data_dir ./ \
|
||||||
|
--model_type bert \
|
||||||
|
--labels ./labels.txt \
|
||||||
|
--model_name_or_path $BERT_MODEL \
|
||||||
|
--output_dir $OUTPUT_DIR \
|
||||||
|
--max_seq_length $MAX_LENGTH \
|
||||||
|
--num_train_epochs $NUM_EPOCHS \
|
||||||
|
--per_device_train_batch_size $BATCH_SIZE \
|
||||||
|
--save_steps $SAVE_STEPS \
|
||||||
|
--seed $SEED \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_predict
|
||||||
|
```
|
||||||
|
|
||||||
|
As with the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
|
||||||
|
|
||||||
|
#### Evaluation
|
||||||
|
|
||||||
|
Evaluation on development dataset outputs the following for our example:
|
||||||
|
```bash
|
||||||
|
precision recall f1-score support
|
||||||
|
|
||||||
|
LOCderiv 0.7619 0.6154 0.6809 52
|
||||||
|
PERpart 0.8724 0.8997 0.8858 4057
|
||||||
|
OTHpart 0.9360 0.9466 0.9413 711
|
||||||
|
ORGpart 0.7015 0.6989 0.7002 269
|
||||||
|
LOCpart 0.7668 0.8488 0.8057 496
|
||||||
|
LOC 0.8745 0.9191 0.8963 235
|
||||||
|
ORGderiv 0.7723 0.8571 0.8125 91
|
||||||
|
OTHderiv 0.4800 0.6667 0.5581 18
|
||||||
|
OTH 0.5789 0.6875 0.6286 16
|
||||||
|
PERderiv 0.5385 0.3889 0.4516 18
|
||||||
|
PER 0.5000 0.5000 0.5000 2
|
||||||
|
ORG 0.0000 0.0000 0.0000 3
|
||||||
|
|
||||||
|
micro avg 0.8574 0.8862 0.8715 5968
|
||||||
|
macro avg 0.8575 0.8862 0.8713 5968
|
||||||
|
```
|
||||||
|
|
||||||
|
On the test dataset the following results could be achieved:
|
||||||
|
```bash
|
||||||
|
precision recall f1-score support
|
||||||
|
|
||||||
|
PERpart 0.8847 0.8944 0.8896 9397
|
||||||
|
OTHpart 0.9376 0.9353 0.9365 1639
|
||||||
|
ORGpart 0.7307 0.7044 0.7173 697
|
||||||
|
LOC 0.9133 0.9394 0.9262 561
|
||||||
|
LOCpart 0.8058 0.8157 0.8107 1150
|
||||||
|
ORG 0.0000 0.0000 0.0000 8
|
||||||
|
OTHderiv 0.5882 0.4762 0.5263 42
|
||||||
|
PERderiv 0.6571 0.5227 0.5823 44
|
||||||
|
OTH 0.4906 0.6667 0.5652 39
|
||||||
|
ORGderiv 0.7016 0.7791 0.7383 172
|
||||||
|
LOCderiv 0.8256 0.6514 0.7282 109
|
||||||
|
PER 0.0000 0.0000 0.0000 11
|
||||||
|
|
||||||
|
micro avg 0.8722 0.8774 0.8748 13869
|
||||||
|
macro avg 0.8712 0.8774 0.8740 13869
|
||||||
|
```
|
||||||
|
|
||||||
## Abstractive summarization
|
## Abstractive summarization
|
||||||
|
|
||||||
Based on the script
|
Based on the script
|
||||||
@@ -600,3 +672,43 @@ python run_summarization_finetuning.py \
|
|||||||
--do_train \
|
--do_train \
|
||||||
--data_path=$DATA_PATH \
|
--data_path=$DATA_PATH \
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## XNLI
|
||||||
|
|
||||||
|
Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
|
||||||
|
|
||||||
|
[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is a crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
|
||||||
|
|
||||||
|
#### Fine-tuning on XNLI
|
||||||
|
|
||||||
|
This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins
|
||||||
|
on a single Tesla V100 16GB. The data for XNLI can be downloaded with the following links and should both be saved (and un-zipped) in a
|
||||||
|
`$XNLI_DIR` directory.
|
||||||
|
|
||||||
|
* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
|
||||||
|
* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export XNLI_DIR=/path/to/XNLI
|
||||||
|
|
||||||
|
python run_xnli.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-multilingual-cased \
|
||||||
|
--language de \
|
||||||
|
--train_language en \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--data_dir $XNLI_DIR \
|
||||||
|
--per_gpu_train_batch_size 32 \
|
||||||
|
--learning_rate 5e-5 \
|
||||||
|
--num_train_epochs 2.0 \
|
||||||
|
--max_seq_length 128 \
|
||||||
|
--output_dir /tmp/debug_xnli/ \
|
||||||
|
--save_steps -1
|
||||||
|
```
|
||||||
|
|
||||||
|
Training with the previously defined hyper-parameters yields the following results on the **test** set:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
acc = 0.7093812375249501
|
||||||
|
```
|
||||||
|
|||||||
@@ -2,6 +2,10 @@
|
|||||||
|
|
||||||
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
|
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
|
||||||
|
|
||||||
|
**December 6th, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||||
|
|
||||||
|
**November 19th, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
|
||||||
|
|
||||||
**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
|
**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
|
||||||
|
|
||||||
**October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
|
**October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
|
||||||
@@ -15,8 +19,9 @@ Distil* is a class of compressed models that started with DistilBERT. DistilBERT
|
|||||||
|
|
||||||
We have applied the same method to other Transformer architectures and released the weights:
|
We have applied the same method to other Transformer architectures and released the weights:
|
||||||
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
|
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
|
||||||
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base` performance on GLUE while being twice faster and 35% smaller.
|
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice faster and 35% smaller.
|
||||||
- and more to come! 🤗🤗🤗
|
- German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
|
||||||
|
- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice faster and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||||
|
|
||||||
For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
|
For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
|
||||||
|
|
||||||
@@ -27,7 +32,7 @@ Here are the results on the dev sets of GLUE:
|
|||||||
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
||||||
| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
||||||
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
||||||
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
|
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
|
||||||
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
|
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
|
||||||
|
|
||||||
<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly perform transfer learning on the pre-trained DistilRoBERTa.
|
<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly perform transfer learning on the pre-trained DistilRoBERTa.
|
||||||
@@ -36,6 +41,14 @@ Here are the results on the dev sets of GLUE:
|
|||||||
|
|
||||||
<sup>3</sup> We compute this score ourselves for completeness.
|
<sup>3</sup> We compute this score ourselves for completeness.
|
||||||
|
|
||||||
|
Here are the results on the *test* sets for 6 of the languages available in XNLI. The results are computed in the zero shot setting (trained on the English portion and evaluated on the target language portion):
|
||||||
|
|
||||||
|
| Model | English | Spanish | Chinese | German | Arabic | Urdu |
|
||||||
|
| :---: | :---: | :---: | :---: | :---: | :---: | :---:|
|
||||||
|
| mBERT base cased (computed) | 82.1 | 74.6 | 69.1 | 72.3 | 66.4 | 58.5 |
|
||||||
|
| mBERT base uncased (reported)| 81.4 | 74.3 | 63.8 | 70.5 | 62.1 | 58.3 |
|
||||||
|
| DistilmBERT | 78.2 | 69.1 | 64.0 | 66.3 | 59.1 | 54.7 |
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
|
This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
|
||||||
@@ -45,13 +58,14 @@ This part of the library has only be tested with Python3.6+. There are few speci
|
|||||||
|
|
||||||
## How to use DistilBERT
|
## How to use DistilBERT
|
||||||
|
|
||||||
Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
Transformers includes six pre-trained Distil* models, currently provided for English, German and a multilingual setting covering 104 languages:
|
||||||
|
|
||||||
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
|
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
|
||||||
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches an 88.5 F1 score).
|
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches an 88.5 F1 score).
|
||||||
|
- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches an F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches an 84.52 F1 score), and an F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches an 86.89 F1 score).
|
||||||
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
|
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
|
||||||
- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
|
- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
|
||||||
- and more to come! 🤗🤗🤗
|
- `distilbert-base-multilingual-cased`: DistilmBERT multilingual model pretrained with the supervision of `bert-base-multilingual-cased` on the concatenation of Wikipedia in 104 different languages. The model supports the 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). The model has 6 layers, 768 dimension and 12 heads, totalizing 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base.
|
||||||
|
|
||||||
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
|
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
|
||||||
|
|
||||||
@@ -67,6 +81,7 @@ last_hidden_states = outputs[0] # The last hidden-state is the first element of
|
|||||||
Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
|
Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
|
||||||
- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
|
- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
|
||||||
- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
|
- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
|
||||||
|
- DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
|
||||||
|
|
||||||
|
|
||||||
## How to train Distil*
|
## How to train Distil*
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ import psutil
|
|||||||
import time
|
import time
|
||||||
from tqdm import trange, tqdm
|
from tqdm import trange, tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import psutil
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|||||||
@@ -3,4 +3,4 @@ tensorboard>=1.14.0
|
|||||||
tensorboardX==1.8
|
tensorboardX==1.8
|
||||||
psutil==5.6.3
|
psutil==5.6.3
|
||||||
scipy==1.3.1
|
scipy==1.3.1
|
||||||
transformers==2.0.0
|
transformers
|
||||||
|
|||||||
54
examples/pplm/README.md
Normal file
54
examples/pplm/README.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# Plug and Play Language Models: a Simple Approach to Controlled Text Generation
|
||||||
|
|
||||||
|
Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/)
|
||||||
|
|
||||||
|
This folder contains the original code used to run the Plug and Play Language Model (PPLM).
|
||||||
|
|
||||||
|
Paper link: https://arxiv.org/abs/1912.02164
|
||||||
|
|
||||||
|
Blog link: https://eng.uber.com/pplm
|
||||||
|
|
||||||
|
Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM
|
||||||
|
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/huggingface/transformers && cd transformers
|
||||||
|
pip install [--editable] .
|
||||||
|
pip install nltk torchtext # additional requirements.
|
||||||
|
cd examples/pplm
|
||||||
|
```
|
||||||
|
|
||||||
|
## PPLM-BoW
|
||||||
|
|
||||||
|
### Example command for bag-of-words control
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tuning hyperparameters for bag-of-words control
|
||||||
|
|
||||||
|
1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model.
|
||||||
|
|
||||||
|
2. If the language being generated is repetitive (e.g. "science science experiment experiment"), there are several options to consider: <br/>
|
||||||
|
a) Reduce the `--stepsize` <br/>
|
||||||
|
b) Increase `--kl_scale` (the KL-loss coefficient) or decrease `--gm_scale` (the gm-scaling term) <br/>
|
||||||
|
c) Add `--grad_length xx`, where xx is an integer <= length (e.g. `--grad_length 30`).<br/>
|
||||||
|
|
||||||
|
|
||||||
|
## PPLM-Discrim
|
||||||
|
|
||||||
|
### Example command for discriminator based sentiment control
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tuning hyperparameters for discriminator control
|
||||||
|
|
||||||
|
1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model.
|
||||||
|
|
||||||
|
2. Use `--class_label 3` for negative, and `--class_label 2` for positive
|
||||||
|
|
||||||
BIN
examples/pplm/imgs/headfigure.png
Normal file
BIN
examples/pplm/imgs/headfigure.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 653 KiB |
BIN
examples/pplm/imgs/wooly.png
Normal file
BIN
examples/pplm/imgs/wooly.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 664 KiB |
18
examples/pplm/pplm_classification_head.py
Normal file
18
examples/pplm/pplm_classification_head.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
class ClassificationHead(torch.nn.Module):
    """Linear classification head applied to transformer hidden states.

    Args:
        class_size: number of classes, i.e. the width of the returned logits.
        embed_size: size of the last dimension of the incoming hidden states.
    """

    def __init__(self, class_size, embed_size):
        super().__init__()
        self.embed_size = embed_size
        self.class_size = class_size
        # Keep the attribute name `mlp`: it determines the state_dict keys
        # ("mlp.weight" / "mlp.bias") of this module.
        self.mlp = torch.nn.Linear(embed_size, class_size)

    def forward(self, hidden_state):
        """Return the class logits computed from ``hidden_state``."""
        return self.mlp(hidden_state)
|
||||||
879
examples/pplm/run_pplm.py
Normal file
879
examples/pplm/run_pplm.py
Normal file
@@ -0,0 +1,879 @@
|
|||||||
|
#! /usr/bin/env python3
|
||||||
|
# coding=utf-8
|
||||||
|
|
||||||
|
#Copyright (c) 2019 Uber Technologies, Inc.
|
||||||
|
#
|
||||||
|
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
#you may not use this file except in compliance with the License.
|
||||||
|
#You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
#http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
#Unless required by applicable law or agreed to in writing, software
|
||||||
|
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
#See the License for the specific language governing permissions and
|
||||||
|
#limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example command with bag of words:
|
||||||
|
python examples/pplm/run_pplm.py -B space --cond_text "The president" --length 100 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.01 --window_length 5 --kl_scale 0.01 --gm_scale 0.95
|
||||||
|
|
||||||
|
Example command with discriminator:
|
||||||
|
python examples/pplm/run_pplm.py -D sentiment --class_label 3 --cond_text "The lake" --length 10 --gamma 1.0 --num_iterations 30 --num_samples 10 --stepsize 0.01 --kl_scale 0.01 --gm_scale 0.95
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from operator import add
|
||||||
|
from typing import List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch.autograd import Variable
|
||||||
|
from tqdm import trange
|
||||||
|
|
||||||
|
from transformers import GPT2Tokenizer
|
||||||
|
from transformers.file_utils import cached_path
|
||||||
|
from transformers.modeling_gpt2 import GPT2LMHeadModel
|
||||||
|
from pplm_classification_head import ClassificationHead
|
||||||
|
|
||||||
|
# Values of `loss_type`: which attribute model(s) drive the perturbation.
PPLM_BOW = 1
PPLM_DISCRIM = 2
PPLM_BOW_DISCRIM = 3

# SMALL_CONST keeps probabilities and gradient norms away from exact zero;
# -BIG_CONST stands in for -infinity when logits are masked.
SMALL_CONST = 1e-15
BIG_CONST = 1e10

# Bag-of-words id -> URL of the corresponding word list.
BAG_OF_WORDS_ARCHIVE_MAP = {
    "legal": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
    "military": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
    "politics": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
    "religion": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
    "science": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
    "space": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
    "technology": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
}

# Per-discriminator settings: where the head weights live, the head's
# dimensions, the label vocabulary, the fallback label, and the language
# model the head is paired with.
_CLICKBAIT_PARAMS = {
    "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/clickbait_classifier_head.pt",
    "class_size": 2,
    "embed_size": 1024,
    "class_vocab": {"non_clickbait": 0, "clickbait": 1},
    "default_class": 1,
    "pretrained_model": "gpt2-medium",
}

_SENTIMENT_PARAMS = {
    "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/SST_classifier_head.pt",
    "class_size": 5,
    "embed_size": 1024,
    "class_vocab": {"very_positive": 2, "very_negative": 3},
    "default_class": 3,
    "pretrained_model": "gpt2-medium",
}

# Discriminator name -> its settings.
DISCRIMINATOR_MODELS_PARAMS = {
    "clickbait": _CLICKBAIT_PARAMS,
    "sentiment": _SENTIMENT_PARAMS,
}
|
||||||
|
|
||||||
|
|
||||||
|
def to_var(x, requires_grad=False, volatile=False, device='cuda'):
    """Place ``x`` on ``device`` and wrap it in an autograd ``Variable``.

    When ``device`` is ``'cuda'`` the tensor is moved only if CUDA is
    available; otherwise it is left where it is. Any other ``device`` value is
    passed to ``x.to``. ``requires_grad`` and ``volatile`` are forwarded to
    ``Variable`` unchanged.
    """
    if device != 'cuda':
        x = x.to(device)
    elif torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, requires_grad=requires_grad, volatile=volatile)
|
||||||
|
|
||||||
|
|
||||||
|
def top_k_filter(logits, k, probs=False):
    """Keep the ``k`` largest entries of every row and mask out the rest.

    Entries smaller than the row's k-th largest value are replaced by
    ``-BIG_CONST`` (a stand-in for -infinity, so that they vanish under a
    softmax), or by ``0.0`` when ``probs`` is true because the input already
    holds probabilities. ``k == 0`` disables filtering and returns ``logits``
    unchanged.
    """
    if k == 0:
        return logits

    # k-th largest value of each row, broadcast to the shape of logits
    kth_largest = torch.topk(logits, k)[0][:, -1]
    threshold = kth_largest.view(-1, 1).expand_as(logits)
    below_top_k = logits < threshold

    if probs:
        fill = torch.ones_like(logits) * 0.0
    else:
        fill = torch.ones_like(logits) * -BIG_CONST
    return torch.where(below_top_k, fill, logits)
|
||||||
|
|
||||||
|
|
||||||
|
def perturb_past(
        past,
        model,
        last,
        unpert_past=None,
        unpert_logits=None,
        accumulated_hidden=None,
        grad_norms=None,
        stepsize=0.01,
        one_hot_bows_vectors=None,
        classifier=None,
        class_label=None,
        loss_type=0,
        num_iterations=3,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        kl_scale=0.01,
        device='cuda',
):
    """Perturb ``past`` with a few normalized gradient steps on the PPLM loss.

    Starting from a zero perturbation, each iteration adds the current
    perturbation to ``past``, runs ``model(last, past=...)``, builds the loss
    (bag-of-words and/or discriminator term, plus an optional KL term against
    ``unpert_logits``), back-propagates it to the perturbation and accumulates
    ``-stepsize * grad * window_mask / norm ** gamma``.

    Args:
        past: sequence of tensors; each shape unpacks into five dimensions,
            the fourth of which is read as the current length.
        model: called as ``model(last, past=...)`` and, for the discriminator
            loss, as ``model(past=..., inputs_embeds=...)``; must return a
            3-tuple whose last item is the list of hidden states, and provide
            ``resize_token_embeddings()``.
        last: input fed to ``model`` together with the perturbed past.
        unpert_past: past from which the discriminator horizon roll-out
            starts.
        unpert_logits: logits whose last position is the reference
            distribution of the KL term (read only when ``kl_scale > 0``).
        accumulated_hidden: value added to the summed last-layer hidden
            states; ``None`` is treated as 0.
        grad_norms: norms from a previous call. With ``loss_type == PPLM_BOW``
            the element-wise maximum of these and the new norms is used;
            otherwise they are recomputed.
        stepsize: scale of each gradient step.
        one_hot_bows_vectors: one matrix per bag of words, multiplied with the
            next-token probabilities for the bag-of-words loss.
        classifier: module applied to the averaged hidden state for the
            discriminator loss.
        class_label: target class index for the discriminator loss.
        loss_type: ``PPLM_BOW``, ``PPLM_DISCRIM`` or ``PPLM_BOW_DISCRIM``.
        num_iterations: number of gradient steps.
        horizon_length: number of extra model steps rolled out before the
            classifier is applied.
        window_length: when positive and smaller than the current length, only
            the first ``window_length`` positions along the length axis of the
            mask are non-zero; otherwise the whole past is perturbed.
        decay: scale the window positions by a linear ramp instead of 1.
        gamma: exponent applied to the gradient norm when normalizing.
        kl_scale: weight of the KL term; ``0`` disables it.
        device: device on which masks, labels and perturbations are placed.

    Returns:
        ``(pert_past, new_accumulated_hidden, grad_norms, loss_per_iter)``;
        ``new_accumulated_hidden`` stays ``None`` when ``num_iterations`` is 0.
    """
    # Perturbation accumulated so far: one float32 array per element of past
    grad_accumulator = [
        np.zeros(p.shape).astype("float32")
        for p in past
    ]

    if accumulated_hidden is None:
        accumulated_hidden = 0

    # The ramp is only consumed by the windowed mask below, which requires
    # window_length > 0; skipping it otherwise avoids a division by zero when
    # decay is requested with the default window_length of 0.
    if decay and window_length > 0:
        decay_mask = torch.arange(
            0.,
            1.0 + SMALL_CONST,
            1.0 / window_length
        )[1:]
    else:
        decay_mask = 1.0

    # Build the mask that restricts the perturbation to a window of the past
    _, _, _, curr_length, _ = past[0].shape

    if curr_length > window_length and window_length > 0:
        ones_key_val_shape = (
            tuple(past[0].shape[:-2])
            + tuple([window_length])
            + tuple(past[0].shape[-1:])
        )

        zeros_key_val_shape = (
            tuple(past[0].shape[:-2])
            + tuple([curr_length - window_length])
            + tuple(past[0].shape[-1:])
        )

        ones_mask = torch.ones(ones_key_val_shape)
        # swap the last two axes so the ramp broadcasts along the length
        # axis, then swap them back
        ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3)
        ones_mask = ones_mask.permute(0, 1, 2, 4, 3)

        window_mask = torch.cat(
            (ones_mask, torch.zeros(zeros_key_val_shape)),
            dim=-2
        ).to(device)
    else:
        window_mask = torch.ones_like(past[0]).to(device)

    # accumulate perturbations for num_iterations
    loss_per_iter = []
    new_accumulated_hidden = None
    for i in range(num_iterations):
        print("Iteration ", i + 1)
        curr_perturbation = [
            to_var(torch.from_numpy(p_), requires_grad=True, device=device)
            for p_ in grad_accumulator
        ]

        # Compute hidden using perturbed past
        perturbed_past = list(map(add, past, curr_perturbation))
        _, _, _, curr_length, _ = curr_perturbation[0].shape
        all_logits, _, all_hidden = model(last, past=perturbed_past)
        hidden = all_hidden[-1]
        new_accumulated_hidden = accumulated_hidden + torch.sum(
            hidden,
            dim=1
        ).detach()
        # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
        logits = all_logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)

        loss = 0.0
        if loss_type in (PPLM_BOW, PPLM_BOW_DISCRIM):
            for one_hot_bow in one_hot_bows_vectors:
                bow_logits = torch.mm(probs, torch.t(one_hot_bow))
                bow_loss = -torch.log(torch.sum(bow_logits))
                loss += bow_loss
            print(" pplm_bow_loss:", loss.data.cpu().numpy())

        if loss_type in (PPLM_DISCRIM, PPLM_BOW_DISCRIM):
            ce_loss = torch.nn.CrossEntropyLoss()
            # TODO why we need to do this assignment and not just using unpert_past? (Sumanth)
            curr_unpert_past = unpert_past
            curr_probs = torch.unsqueeze(probs, dim=1)
            wte = model.resize_token_embeddings()
            for _ in range(horizon_length):
                # feed the expected embedding under the current distribution
                inputs_embeds = torch.matmul(curr_probs, wte.weight.data)
                _, curr_unpert_past, curr_all_hidden = model(
                    past=curr_unpert_past,
                    inputs_embeds=inputs_embeds
                )
                curr_hidden = curr_all_hidden[-1]
                new_accumulated_hidden = new_accumulated_hidden + torch.sum(
                    curr_hidden, dim=1)

            prediction = classifier(new_accumulated_hidden /
                                    (curr_length + 1 + horizon_length))

            label = torch.tensor(prediction.shape[0] * [class_label],
                                 device=device,
                                 dtype=torch.long)
            discrim_loss = ce_loss(prediction, label)
            print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy())
            loss += discrim_loss

        kl_loss = 0.0
        if kl_scale > 0.0:
            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
            # bump (near-)zero probabilities so the ratio and log stay finite
            unpert_probs = (
                unpert_probs + SMALL_CONST *
                (unpert_probs <= SMALL_CONST).float().to(device).detach()
            )
            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(
                device).detach()
            corrected_probs = probs + correction.detach()
            kl_loss = kl_scale * (
                (corrected_probs * (corrected_probs / unpert_probs).log()).sum()
            )
            print(' kl_loss', kl_loss.data.cpu().numpy())
            loss += kl_loss

        loss_per_iter.append(loss.data.cpu().numpy())
        print(' pplm_loss', (loss - kl_loss).data.cpu().numpy())

        # compute gradients
        loss.backward()

        # calculate gradient norms
        if grad_norms is not None and loss_type == PPLM_BOW:
            grad_norms = [
                torch.max(grad_norms[index], torch.norm(p_.grad * window_mask))
                for index, p_ in enumerate(curr_perturbation)
            ]
        else:
            grad_norms = [
                (torch.norm(p_.grad * window_mask) + SMALL_CONST)
                for p_ in curr_perturbation
            ]

        # normalize gradients
        grad = [
            -stepsize *
            (p_.grad * window_mask / grad_norms[index] ** gamma).data.cpu().numpy()
            for index, p_ in enumerate(curr_perturbation)
        ]

        # accumulate gradient
        grad_accumulator = list(map(add, grad, grad_accumulator))

        # reset gradients, just to make sure
        for p_ in curr_perturbation:
            p_.grad.data.zero_()

        # removing past from the graph
        past = [p_.detach() for p_ in past]

    # apply the accumulated perturbations to the past
    grad_accumulator = [
        to_var(torch.from_numpy(p_), requires_grad=True, device=device)
        for p_ in grad_accumulator
    ]
    pert_past = list(map(add, past, grad_accumulator))

    return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter
|
||||||
|
|
||||||
|
|
||||||
|
def get_classifier(
        name: Optional[str], class_label: Union[str, int],
        device: str
) -> Tuple[Optional[ClassificationHead], Optional[int]]:
    """Load the discriminator head ``name`` and resolve the target label id.

    The head is built from the ``DISCRIMINATOR_MODELS_PARAMS[name]`` entry,
    its weights are loaded from the entry's ``url`` (through ``cached_path``)
    or ``path``, and it is switched to eval mode.

    ``class_label`` may be a key of the entry's ``class_vocab`` (str) or one
    of its values (int). An unknown label falls back to ``default_class``
    after printing a notice; a label of any other type silently uses
    ``default_class``.

    Returns:
        ``(classifier, label_id)``, or ``(None, None)`` when ``name`` is None.

    Raises:
        ValueError: if the entry has neither ``url`` nor ``path``.
    """
    if name is None:
        return None, None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = ClassificationHead(
        class_size=params['class_size'],
        embed_size=params['embed_size']
    ).to(device)

    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")

    state_dict = torch.load(resolved_archive_file, map_location=device)
    classifier.load_state_dict(state_dict)
    classifier.eval()

    # Labels that are neither str nor int: default class, no notice.
    if not isinstance(class_label, (str, int)):
        return classifier, params["default_class"]

    if isinstance(class_label, str):
        known = class_label in params["class_vocab"]
        if known:
            label_id = params["class_vocab"][class_label]
    else:
        known = class_label in set(params["class_vocab"].values())
        if known:
            label_id = class_label

    if not known:
        label_id = params["default_class"]
        print("class_label {} not in class_vocab".format(class_label))
        print("available values are: {}".format(params["class_vocab"]))
        print("using default class {}".format(label_id))

    return classifier, label_id
|
||||||
|
|
||||||
|
|
||||||
|
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \
        List[List[List[int]]]:
    """Tokenize every word of every requested bag of words.

    Each entry is either a key of ``BAG_OF_WORDS_ARCHIVE_MAP`` (the word list
    is then fetched through ``cached_path``) or a path to a text file with one
    word per line.

    Returns:
        One list per bag, holding for each word the result of
        ``tokenizer.encode`` on the stripped word (with
        ``add_prefix_space=True``).
    """
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path not in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = id_or_path
        else:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])

        with open(filepath, "r") as f:
            words = f.read().strip().split("\n")

        encoded_words = []
        for word in words:
            encoded_words.append(
                tokenizer.encode(word.strip(), add_prefix_space=True))
        bow_indices.append(encoded_words)
    return bow_indices
|
||||||
|
|
||||||
|
|
||||||
|
def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):
    """Turn tokenized bags of words into one-hot matrices over the vocabulary.

    Only words encoded as exactly one token are kept. Multi-token words
    cannot be expressed as a single one-hot row, and empty encodings (e.g.
    from a blank line in a word list) would otherwise make the index tensor
    ragged.

    Args:
        bow_indices: one list per bag, each holding the token-id list of every
            word; ``None`` is passed through.
        tokenizer: only ``tokenizer.vocab_size`` is read.
        device: device on which the matrices are created.

    Returns:
        ``None`` if ``bow_indices`` is ``None``; otherwise one float tensor of
        shape ``(num_single_token_words, vocab_size)`` per bag, with a 1 at
        each word's token id. A bag without single-token words yields a matrix
        with zero rows.
    """
    if bow_indices is None:
        return None

    one_hot_bows_vectors = []
    for single_bow in bow_indices:
        single_token_words = [ids for ids in single_bow if len(ids) == 1]
        # reshape keeps the index 2-D (num_words, 1) even when no word is left
        index = torch.tensor(
            single_token_words, dtype=torch.long, device=device
        ).reshape(-1, 1)
        one_hot_bow = torch.zeros(
            index.shape[0], tokenizer.vocab_size, device=device)
        one_hot_bow.scatter_(1, index, 1)
        one_hot_bows_vectors.append(one_hot_bow)
    return one_hot_bows_vectors
|
||||||
|
|
||||||
|
|
||||||
|
def full_text_generation(
        model,
        tokenizer,
        context=None,
        num_samples=1,
        device="cuda",
        bag_of_words=None,
        discrim=None,
        class_label=None,
        length=100,
        stepsize=0.02,
        temperature=1.0,
        top_k=10,
        sample=False,
        num_iterations=3,
        grad_length=10000,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        gm_scale=0.9,
        kl_scale=0.01,
        **kwargs
):
    """Generate one unperturbed text and ``num_samples`` perturbed texts.

    The attribute model is chosen from the arguments: ``bag_of_words`` (a
    ``;``-separated list of bag ids or file paths) and/or ``discrim`` (a
    discriminator name resolved by ``get_classifier``). At least one of them
    is required. The remaining keyword arguments are forwarded to
    ``generate_text_pplm``; extra ``**kwargs`` are accepted and ignored.

    Returns:
        ``(unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses,
        losses_in_time)``. ``discrim_losses`` is filled only when a
        discriminator is used.

    Raises:
        Exception: if neither a bag of words nor a discriminator is given.
    """
    classifier, class_id = get_classifier(discrim, class_label, device)

    bow_indices = []
    if bag_of_words:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)

    if bag_of_words and classifier:
        print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.")
        loss_type = PPLM_BOW_DISCRIM
    elif bag_of_words:
        loss_type = PPLM_BOW
        print("Using PPLM-BoW")
    elif classifier is not None:
        loss_type = PPLM_DISCRIM
        print("Using PPLM-Discrim")
    else:
        raise Exception("Specify either a bag of words or a discriminator")

    # arguments shared by the unperturbed and the perturbed runs
    common_args = dict(
        model=model,
        tokenizer=tokenizer,
        context=context,
        device=device,
        length=length,
        sample=sample,
    )

    unpert_gen_tok_text, _, _ = generate_text_pplm(
        perturb=False, **common_args)
    if device == 'cuda':
        torch.cuda.empty_cache()

    # arguments that only matter when the past is perturbed
    perturb_args = dict(
        perturb=True,
        bow_indices=bow_indices,
        classifier=classifier,
        class_label=class_id,
        loss_type=loss_type,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
    )

    pert_gen_tok_texts = []
    discrim_losses = []
    losses_in_time = []

    for _ in range(num_samples):
        pert_gen_tok_text, discrim_loss, loss_in_time = generate_text_pplm(
            **common_args, **perturb_args)
        pert_gen_tok_texts.append(pert_gen_tok_text)
        if classifier is not None:
            discrim_losses.append(discrim_loss.data.cpu().numpy())
        losses_in_time.append(loss_in_time)

    if device == 'cuda':
        torch.cuda.empty_cache()

    return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
|
||||||
|
|
||||||
|
|
||||||
|
def generate_text_pplm(
        model,
        tokenizer,
        context=None,
        past=None,
        device="cuda",
        perturb=True,
        bow_indices=None,
        classifier=None,
        class_label=None,
        loss_type=0,
        length=100,
        stepsize=0.02,
        temperature=1.0,
        top_k=10,
        sample=False,
        num_iterations=3,
        grad_length=10000,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        gm_scale=0.9,
        kl_scale=0.01,
):
    """Generate ``length`` tokens after ``context``, optionally steering them.

    Every step runs the model on the full sequence to get the unperturbed
    logits and hidden states. When ``perturb`` is true and
    ``num_iterations`` is non-zero, the past is modified with
    ``perturb_past``; the step size drops to 0 once the step index reaches
    ``grad_length``. The next-token distribution is then computed from the
    (possibly perturbed) past. With ``perturb`` it is fused with the
    unperturbed distribution as ``pert ** gm_scale * unpert ** (1 -
    gm_scale)``, top-k filtered and rescaled; without it the logits are top-k
    filtered and soft-maxed. The token is sampled when ``sample`` is true,
    otherwise the most likely one is taken.

    The sequence decoded so far is printed after every step.

    Returns:
        ``(output_so_far, unpert_discrim_loss, loss_in_time)``: the token
        tensor including the context, the classifier loss of the last step on
        the unperturbed hidden states (0 without a classifier), and the list
        of per-step loss histories returned by ``perturb_past``.
    """
    output_so_far = None
    if context:
        context_t = torch.tensor(context, device=device, dtype=torch.long)
        # make sure there is a leading batch dimension
        while context_t.dim() < 2:
            context_t = context_t.unsqueeze(0)
        output_so_far = context_t

    # collect one hot vectors for bags of words
    one_hot_bows_vectors = build_bows_one_hot_vectors(
        bow_indices, tokenizer, device)

    grad_norms = None
    last = None
    unpert_discrim_loss = 0
    loss_in_time = []
    for step in trange(length, ascii=True):

        # The model takes two inputs, the past and the current token, so on
        # the first step split the context into "everything but the last
        # token" (turned into the past) and the last token.
        if past is None and output_so_far is not None:
            last = output_so_far[:, -1:]
            if output_so_far.shape[1] > 1:
                _, past, _ = model(output_so_far[:, :-1])

        # unperturbed forward pass over the whole sequence
        unpert_logits, unpert_past, unpert_all_hidden = model(output_so_far)
        unpert_last_hidden = unpert_all_hidden[-1]

        # stop stepping once we are above the maximum perturbed length
        current_stepsize = stepsize * 0 if step >= grad_length else stepsize

        # modify the past if necessary
        pert_past = past
        if perturb and num_iterations != 0 and past is not None:
            accumulated_hidden = torch.sum(
                unpert_last_hidden[:, :-1, :], dim=1)
            pert_past, _, grad_norms, loss_this_iter = perturb_past(
                past,
                model,
                last,
                unpert_past=unpert_past,
                unpert_logits=unpert_logits,
                accumulated_hidden=accumulated_hidden,
                grad_norms=grad_norms,
                stepsize=current_stepsize,
                one_hot_bows_vectors=one_hot_bows_vectors,
                classifier=classifier,
                class_label=class_label,
                loss_type=loss_type,
                num_iterations=num_iterations,
                horizon_length=horizon_length,
                window_length=window_length,
                decay=decay,
                gamma=gamma,
                kl_scale=kl_scale,
                device=device,
            )
            loss_in_time.append(loss_this_iter)

        pert_logits, past, pert_all_hidden = model(last, past=pert_past)
        pert_logits = pert_logits[:, -1, :] / temperature
        pert_probs = F.softmax(pert_logits, dim=-1)

        if classifier is None:
            unpert_discrim_loss = 0
        else:
            ce_loss = torch.nn.CrossEntropyLoss()
            prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
            label = torch.tensor([class_label], device=device,
                                 dtype=torch.long)
            unpert_discrim_loss = ce_loss(prediction, label)
            print(
                "unperturbed discrim loss",
                unpert_discrim_loss.data.cpu().numpy()
            )

        if perturb:
            # Fuse the modified model and original model
            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
            fused = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))
            pert_probs = top_k_filter(fused, k=top_k, probs=True)

            # rescale
            total = torch.sum(pert_probs)
            if total <= 1:
                pert_probs = pert_probs / torch.sum(pert_probs)
        else:
            pert_logits = top_k_filter(pert_logits, k=top_k)
            pert_probs = F.softmax(pert_logits, dim=-1)

        # sample or greedy
        if sample:
            last = torch.multinomial(pert_probs, num_samples=1)
        else:
            _, last = torch.topk(pert_probs, k=1, dim=-1)

        # update context/output_so_far appending the new token
        if output_so_far is None:
            output_so_far = last
        else:
            output_so_far = torch.cat((output_so_far, last), dim=1)

        print(tokenizer.decode(output_so_far.tolist()[0]))

    return output_so_far, unpert_discrim_loss, loss_in_time
|
||||||
|
|
||||||
|
|
||||||
|
def set_generic_model_params(discrim_weights, discrim_meta):
    """Register a user-supplied discriminator under the name ``'generic'``.

    The JSON file ``discrim_meta`` is loaded, its ``'path'`` entry is set to
    ``discrim_weights``, and the result is stored as
    ``DISCRIMINATOR_MODELS_PARAMS['generic']``.

    Raises:
        ValueError: if either argument is ``None``.
    """
    if discrim_weights is None:
        raise ValueError('When using a generic discriminator, '
                         'discrim_weights need to be specified')
    if discrim_meta is None:
        raise ValueError('When using a generic discriminator, '
                         'discrim_meta need to be specified')

    with open(discrim_meta, 'r') as meta_fp:
        generic_params = json.load(meta_fp)

    generic_params['path'] = discrim_weights
    DISCRIMINATOR_MODELS_PARAMS['generic'] = generic_params
|
||||||
|
|
||||||
|
|
||||||
|
def run_pplm_example(
        pretrained_model="gpt2-medium",
        cond_text="",
        uncond=False,
        num_samples=1,
        bag_of_words=None,
        discrim=None,
        discrim_weights=None,
        discrim_meta=None,
        class_label=-1,
        length=100,
        stepsize=0.02,
        temperature=1.0,
        top_k=10,
        sample=False,
        num_iterations=3,
        grad_length=10000,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        gm_scale=0.9,
        kl_scale=0.01,
        seed=0,
        no_cuda=False,
        colorama=False
):
    """Load GPT-2, generate unperturbed and perturbed texts, and print them.

    Args:
        pretrained_model: model name or path; replaced by the discriminator's
            ``pretrained_model`` entry when ``discrim`` is given.
        cond_text: prefix to condition on; prompted for interactively when
            empty and ``uncond`` is false.
        uncond: condition on the tokenizer's BOS token only.
        num_samples: number of perturbed samples to generate.
        bag_of_words: ``;``-separated bag-of-words ids or file paths.
        discrim: discriminator name; ``'generic'`` registers
            ``discrim_weights`` / ``discrim_meta`` first.
        discrim_weights: weights file of the generic discriminator.
        discrim_meta: JSON metadata file of the generic discriminator.
        class_label: target class for the discriminator.
        seed: seed for the torch and numpy random generators.
        no_cuda: force CPU even if CUDA is available.
        colorama: highlight generated bag-of-words tokens in red (requires the
            ``colorama`` package; falls back to plain text if it is missing).

    All other arguments are forwarded unchanged to ``full_text_generation``.

    Returns:
        None. The texts are printed to stdout.
    """
    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"
        ]
        print("discrim = {}, pretrained_model set "
              "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(
        pretrained_model,
        output_hidden_states=True
    )
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode(
            [tokenizer.bos_token]
        )
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts

    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
    )

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    print("=" * 80)
    print("= Unperturbed generated text =")
    print(unpert_gen_text)
    print()

    # Import the optional dependency once, under its own name, so that the
    # boolean `colorama` parameter is not rebound and a missing package is
    # reported instead of silently suppressing all output.
    colorama_module = None
    if colorama:
        try:
            import colorama as colorama_module
        except ImportError:
            print("colorama is not installed; "
                  "printing perturbed texts without colors")

    bow_word_ids = set()
    if bag_of_words and colorama_module is not None:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # keep only words made of exactly one token, so w[0] is safe
            bow_word_ids.update(
                w[0] for w in single_bow_list if len(w) == 1)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize perturbed text
            token_ids = pert_gen_tok_text.tolist()[0]
            if colorama_module is not None:
                pieces = []
                for word_id in token_ids:
                    piece = tokenizer.decode([word_id])
                    if word_id in bow_word_ids:
                        piece = '{}{}{}'.format(
                            colorama_module.Fore.RED,
                            piece,
                            colorama_module.Style.RESET_ALL
                        )
                    pieces.append(piece)
                pert_gen_text = ''.join(pieces)
            else:
                pert_gen_text = tokenizer.decode(token_ids)
        except Exception as err:
            # best effort: report the sample that failed and keep going
            print("Could not decode perturbed text {}: {}".format(i + 1, err))
            continue

        print("= Perturbed generated text {} =".format(i + 1))
        print(pert_gen_text)
        print()

    return
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: every option below is forwarded by name to
    # run_pplm_example() as a keyword argument.
    parser = argparse.ArgumentParser()

    # --- model and conditioning text ---
    parser.add_argument("--pretrained_model", "-M", type=str,
                        default="gpt2-medium",
                        help="pretrained model name or path to local checkpoint")
    parser.add_argument("--cond_text", type=str, default="The lake",
                        help="Prefix texts to condition on")
    parser.add_argument("--uncond", action="store_true",
                        help="Generate from end-of-text as prefix")
    parser.add_argument("--num_samples", type=int, default=1,
                        help="Number of samples to generate from the modified latents")

    # --- attribute models (bag of words / discriminator) ---
    parser.add_argument("--bag_of_words", "-B", type=str, default=None,
                        help="Bags of words used for PPLM-BoW. "
                             "Either a BOW id (see list in code) or a filepath. "
                             "Multiple BoWs separated by ;")
    parser.add_argument("--discrim", "-D", type=str, default=None,
                        choices=("clickbait", "sentiment", "toxicity", "generic"),
                        help="Discriminator to use")
    parser.add_argument("--discrim_weights", type=str, default=None,
                        help="Weights for the generic discriminator")
    parser.add_argument("--discrim_meta", type=str, default=None,
                        help="Meta information for the generic discriminator")
    parser.add_argument("--class_label", type=int, default=-1,
                        help="Class label used for the discriminator")

    # --- generation settings ---
    parser.add_argument("--length", type=int, default=100)
    parser.add_argument("--stepsize", type=float, default=0.02)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=10)
    # NOTE(review): this help text is identical to the one for --uncond and
    # looks like a copy-paste leftover -- confirm what --sample controls.
    parser.add_argument("--sample", action="store_true",
                        help="Generate from end-of-text as prefix")

    # --- perturbation settings ---
    parser.add_argument("--num_iterations", type=int, default=3)
    parser.add_argument("--grad_length", type=int, default=10000)
    parser.add_argument("--window_length", type=int, default=0,
                        help="Length of past which is being optimized; "
                             "0 corresponds to infinite window length")
    parser.add_argument("--horizon_length", type=int, default=1,
                        help="Length of future to optimize over")
    parser.add_argument("--decay", action="store_true",
                        help="whether to decay or not")
    parser.add_argument("--gamma", type=float, default=1.5)
    parser.add_argument("--gm_scale", type=float, default=0.9)
    parser.add_argument("--kl_scale", type=float, default=0.01)

    # --- runtime ---
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--no_cuda", action="store_true", help="no cuda")
    parser.add_argument("--colorama", action="store_true",
                        help="colors keywords")

    cli_args = parser.parse_args()
    run_pplm_example(**vars(cli_args))
|
||||||
588
examples/pplm/run_pplm_discrim_train.py
Normal file
588
examples/pplm/run_pplm_discrim_train.py
Normal file
@@ -0,0 +1,588 @@
|
|||||||
|
#! /usr/bin/env python3
|
||||||
|
# coding=utf-8
|
||||||
|
|
||||||
|
#Copyright (c) 2019 Uber Technologies, Inc.
|
||||||
|
#
|
||||||
|
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
#you may not use this file except in compliance with the License.
|
||||||
|
#You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
#http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
#Unless required by applicable law or agreed to in writing, software
|
||||||
|
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
#See the License for the specific language governing permissions and
|
||||||
|
#limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import torch.optim
|
||||||
|
import torch.optim as optim
|
||||||
|
import torch.utils.data as data
|
||||||
|
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
||||||
|
from torchtext import data as torchtext_data
|
||||||
|
from torchtext import datasets
|
||||||
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
|
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||||
|
from pplm_classification_head import ClassificationHead
|
||||||
|
|
||||||
|
# Seed the torch and NumPy random number generators as soon as the module
# is imported.
torch.manual_seed(0)
np.random.seed(0)
# Added to the denominator in Discriminator.avg_representation so that an
# all-zero mask does not cause a division by zero.
EPSILON = 1e-10
# Sentence passed to predict() by train_discriminator for its example
# prediction.
example_sentence = "This is incredible! I love it, this is the best chicken I have ever had."
# Length threshold used by train_discriminator: examples whose token count
# is not strictly below this value are reported and skipped.
max_length_seq = 100
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Discriminator(torch.nn.Module):
    """Transformer encoder followed by a Classification Head.

    The encoder is a pretrained GPT-2 LM-head model; the classification
    head is applied to the masked mean of the encoder's hidden states.
    """

    def __init__(
            self,
            class_size,
            pretrained_model="gpt2-medium",
            cached_mode=False,
            device='cpu'
    ):
        """Load tokenizer and encoder and create the classification head.

        Args:
            class_size: number of output classes of the head.
            pretrained_model: name or path given to ``from_pretrained``.
            cached_mode: when True, ``forward`` expects inputs that are
                already averaged representations.
            device: device that inputs and masks are moved to.
        """
        super().__init__()
        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
        # The head's input width is the encoder's hidden size.
        self.embed_size = self.encoder.transformer.config.hidden_size
        self.classifier_head = ClassificationHead(
            class_size=class_size, embed_size=self.embed_size
        )
        self.cached_mode = cached_mode
        self.device = device

    def get_classifier(self):
        """Return the classification head module."""
        return self.classifier_head

    def train_custom(self):
        """Freeze all encoder parameters and put only the head in train mode."""
        for encoder_param in self.encoder.parameters():
            encoder_param.requires_grad = False
        self.classifier_head.train()

    def avg_representation(self, x):
        """Return the mean encoder hidden state over non-zero token ids.

        Positions whose token id equals 0 are masked out of the average.
        NOTE(review): this assumes id 0 only ever means padding -- confirm
        it is not also a real token of the tokenizer's vocabulary.
        """
        # 1.0 where the token id is non-zero, broadcast over the hidden dim.
        keep = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size)
        keep = keep.float().to(self.device).detach()

        hidden, _ = self.encoder.transformer(x)

        summed = torch.sum(hidden * keep, dim=1)
        # EPSILON keeps the division finite when a row is fully masked.
        counts = torch.sum(keep, dim=1).detach() + EPSILON
        return summed / counts

    def forward(self, x):
        """Return per-class log-probabilities for ``x``.

        In cached mode ``x`` is used directly as the averaged
        representation; otherwise it is first passed through
        ``avg_representation``.
        """
        x = x.to(self.device)
        pooled = x if self.cached_mode else self.avg_representation(x)
        return F.log_softmax(self.classifier_head(pooled), dim=-1)
|
||||||
|
|
||||||
|
|
||||||
|
class Dataset(data.Dataset):
    """In-memory dataset pairing each input in ``X`` with a label in ``y``."""

    def __init__(self, X, y):
        """Store the inputs ``X`` and the labels ``y`` as given."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of stored inputs."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the pair at ``index`` as ``{"X": input, "y": label}``."""
        return {"X": self.X[index], "y": self.y[index]}
|
||||||
|
|
||||||
|
|
||||||
|
def collate_fn(data):
    """Collate ``{"X": sequence, "y": label}`` items into a padded batch.

    Returns:
        ``(x_batch, y_batch)``: ``x_batch`` is a long tensor with one row
        per item, right-padded with zeros to the longest sequence;
        ``y_batch`` is a long tensor holding the labels.
    """
    # Transpose the list of dicts into a dict of lists, keyed like item 0.
    columns = {key: [item[key] for item in data] for key in data[0].keys()}

    sequences = columns["X"]
    seq_lengths = [len(seq) for seq in sequences]

    # Zero-initialised, so every position not overwritten stays padding.
    x_batch = torch.zeros(len(sequences), max(seq_lengths)).long()
    for row, (seq, length) in enumerate(zip(sequences, seq_lengths)):
        x_batch[row, :length] = seq[:length]

    y_batch = torch.tensor(columns["y"], dtype=torch.long)
    return x_batch, y_batch
|
||||||
|
|
||||||
|
|
||||||
|
def cached_collate_fn(data):
    """Collate items whose ``"X"`` is an already-computed representation.

    The ``"X"`` tensors are concatenated along dimension 0 and the ``"y"``
    labels are packed into a long tensor.
    """
    # Transpose the list of dicts into a dict of lists, keyed like item 0.
    columns = {key: [item[key] for item in data] for key in data[0].keys()}

    x_batch = torch.cat(columns["X"], 0)
    y_batch = torch.tensor(columns["y"], dtype=torch.long)
    return x_batch, y_batch
|
||||||
|
|
||||||
|
|
||||||
|
def train_epoch(data_loader, discriminator, optimizer,
                epoch=0, log_interval=10, device='cpu'):
    """Run one optimisation pass over ``data_loader``.

    Args:
        data_loader: iterable of ``(input, target)`` tensor batches that
            also exposes ``.dataset`` (used for progress reporting).
        discriminator: callable model returning log-probabilities; must
            provide ``train_custom()``.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index, only used in the log line.
        log_interval: print progress every this many batches; values
            ``<= 0`` disable progress printing.
        device: device the batches are moved to.
    """
    samples_so_far = 0
    discriminator.train_custom()
    total_samples = len(data_loader.dataset)

    for batch_idx, (input_t, target_t) in enumerate(data_loader):
        input_t, target_t = input_t.to(device), target_t.to(device)

        optimizer.zero_grad()

        output_t = discriminator(input_t)
        loss = F.nll_loss(output_t, target_t)
        # A fresh graph is built by every forward pass, so there is no
        # reason to retain the previous one after backward().
        loss.backward()
        optimizer.step()

        samples_so_far += len(input_t)

        # Guard against log_interval <= 0 (would divide by zero).
        if log_interval > 0 and batch_idx % log_interval == 0:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch + 1,
                    samples_so_far, total_samples,
                    100 * samples_so_far / total_samples, loss.item()
                )
            )
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_performance(data_loader, discriminator, device='cpu'):
    """Print the average NLL loss and accuracy of ``discriminator``.

    The model is switched to eval mode and evaluated without gradient
    tracking on every batch of ``data_loader``; averages are taken over
    ``len(data_loader.dataset)``.
    """
    discriminator.eval()
    num_examples = len(data_loader.dataset)
    loss_total = 0
    num_correct = 0

    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            log_probs = discriminator(batch_x)

            # Accumulate the summed (not averaged) loss of the batch.
            batch_loss = F.nll_loss(log_probs, batch_y, reduction="sum")
            loss_total += batch_loss.item()

            # Predicted class = index of the largest log-probability.
            predicted = log_probs.argmax(dim=1, keepdim=True)
            matches = predicted.eq(batch_y.view_as(predicted))
            num_correct += matches.sum().item()

    loss_total /= num_examples

    report = (
        "Performance on test set: "
        "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)"
    ).format(
        loss_total, num_correct, num_examples,
        100. * num_correct / num_examples
    )
    print(report)
|
||||||
|
|
||||||
|
|
||||||
|
def predict(input_sentence, model, classes, cached=False, device='cpu'):
    """Print the class probabilities ``model`` assigns to ``input_sentence``.

    The sentence is encoded with ``model.tokenizer`` into a batch of one.
    When ``cached`` is True the batch is first converted with
    ``model.avg_representation`` before being passed to the model. The
    model's outputs are treated as log-probabilities and exponentiated
    for display, paired in order with the names in ``classes``.
    """
    token_ids = model.tokenizer.encode(input_sentence)
    model_input = torch.tensor([token_ids], dtype=torch.long, device=device)
    if cached:
        model_input = model.avg_representation(model_input)

    output = model(model_input)
    log_probs = output.data.cpu().numpy().flatten().tolist()

    formatted = [
        "{}: {:.4f}".format(class_name, math.exp(log_prob))
        for class_name, log_prob in zip(classes, log_probs)
    ]
    print("Input sentence:", input_sentence)
    print("Predictions:", ", ".join(formatted))
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_data_loader(dataset, batch_size, discriminator,
                           shuffle=False, device='cpu'):
    """Precompute averaged representations and return a loader over them.

    ``dataset`` is iterated once (unshuffled, padded by ``collate_fn``);
    each batch is passed through ``discriminator.avg_representation``
    without gradient tracking and the results are kept on the CPU. The
    returned DataLoader serves those cached representations together with
    the original labels, batched by ``cached_collate_fn``.
    """
    raw_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )

    cached_inputs = []
    cached_labels = []
    for tokens, labels in tqdm(raw_loader, ascii=True):
        with torch.no_grad():
            tokens = tokens.to(device)
            pooled = discriminator.avg_representation(tokens).cpu().detach()
            # Split the batch into one tensor per example, each keeping a
            # leading dimension of size 1.
            cached_inputs.extend(torch.unbind(pooled.unsqueeze(1)))
            cached_labels.extend(labels.cpu().numpy().tolist())

    return torch.utils.data.DataLoader(
        dataset=Dataset(cached_inputs, cached_labels),
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=cached_collate_fn,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_discriminator(class_size, pretrained_model, cached, device):
    """Create a Discriminator with ``class_size`` outputs on ``device``."""
    return Discriminator(
        class_size=class_size,
        pretrained_model=pretrained_model,
        cached_mode=cached,
        device=device
    ).to(device)


def _to_input_tensor(token_ids, device, prefix_id=50256):
    """Return ``token_ids`` as a long tensor prefixed with ``prefix_id``."""
    # NOTE(review): 50256 is presumably GPT-2's end-of-text token id --
    # confirm before using a pretrained_model with another vocabulary.
    return torch.tensor(
        [prefix_id] + token_ids, device=device, dtype=torch.long
    )


def _split_train_test(x, y, train_fraction=0.9):
    """Wrap ``x``/``y`` in a Dataset and randomly split it into two parts."""
    full_dataset = Dataset(x, y)
    train_size = int(train_fraction * len(full_dataset))
    test_size = len(full_dataset) - train_size
    return torch.utils.data.random_split(
        full_dataset, [train_size, test_size]
    )


def _encode_sst_split(examples, tokenizer, class2idx, device):
    """Detokenize, encode and label every example of one SST split."""
    detokenizer = TreebankWordDetokenizer()
    x = []
    y = []
    for i in trange(len(examples), ascii=True):
        fields = vars(examples[i])
        sentence = detokenizer.detokenize(fields["text"])
        x.append(_to_input_tensor(tokenizer.encode(sentence), device))
        y.append(class2idx[fields["label"]])
    return Dataset(x, y)


def _load_dict_lines(path, tokenizer, device, label_fn):
    """Read one ``{"text": ..., "label": ...}`` literal per line of ``path``.

    Returns the encoded texts and ``label_fn(label)`` values of all lines
    that could be parsed and are shorter than ``max_length_seq`` tokens.
    """
    x = []
    y = []
    with open(path) as f:
        for i, line in enumerate(tqdm(f, ascii=True)):
            try:
                # SECURITY NOTE(review): eval() executes arbitrary code
                # found in the file. Only use trusted data files; consider
                # ast.literal_eval if the lines are plain literals.
                d = eval(line)
                seq = tokenizer.encode(d["text"])
                # Resolve the label before touching x/y so that a bad
                # label cannot leave the two lists misaligned.
                label = label_fn(d["label"])
            except Exception:
                print("Error evaluating / tokenizing"
                      " line {}, skipping it".format(i))
                continue

            if len(seq) >= max_length_seq:
                print("Line {} is longer than maximum length {}".format(
                    i, max_length_seq
                ))
                continue

            x.append(_to_input_tensor(seq, device))
            y.append(label)
    return x, y


def train_discriminator(
        dataset, dataset_fp=None, pretrained_model="gpt2-medium",
        epochs=10, batch_size=64, log_interval=10,
        save_model=False, cached=False, no_cuda=False):
    """Train a classification head on top of a frozen pretrained encoder.

    Args:
        dataset: ``"SST"``, ``"clickbait"``, ``"toxic"``; any other value
            is treated as a generic TSV dataset (``class \\t text``).
        dataset_fp: path of the TSV file; required for generic datasets.
        pretrained_model: encoder name or path.
        epochs: number of training epochs.
        batch_size: batch size of the train and test loaders.
        log_interval: batches between progress lines in ``train_epoch``.
        save_model: when True, write the meta JSON once and the head's
            state dict after every epoch into the working directory.
        cached: when True, precompute the encoder representations once and
            train the head on the cached values.
        no_cuda: force CPU even if CUDA is available.

    Raises:
        ValueError: if a generic dataset is selected without ``dataset_fp``.
    """
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    print("Preprocessing {} dataset...".format(dataset))
    start = time.time()

    if dataset == "SST":
        idx2class = ["positive", "negative", "very positive", "very negative",
                     "neutral"]
        class2idx = {c: i for i, c in enumerate(idx2class)}
        default_class = 2

        discriminator = _build_discriminator(
            len(idx2class), pretrained_model, cached, device
        )

        text = torchtext_data.Field()
        label = torchtext_data.Field(sequential=False)
        # The validation split is loaded by torchtext but not used here.
        train_data, _, test_data = datasets.SST.splits(
            text,
            label,
            fine_grained=True,
            train_subtrees=True,
        )

        train_dataset = _encode_sst_split(
            train_data, discriminator.tokenizer, class2idx, device
        )
        test_dataset = _encode_sst_split(
            test_data, discriminator.tokenizer, class2idx, device
        )

    elif dataset == "clickbait":
        idx2class = ["non_clickbait", "clickbait"]
        class2idx = {c: i for i, c in enumerate(idx2class)}
        default_class = 1

        discriminator = _build_discriminator(
            len(idx2class), pretrained_model, cached, device
        )

        x, y = _load_dict_lines(
            "datasets/clickbait/clickbait_train_prefix.txt",
            discriminator.tokenizer, device,
            lambda raw_label: raw_label
        )
        train_dataset, test_dataset = _split_train_test(x, y)

    elif dataset == "toxic":
        idx2class = ["non_toxic", "toxic"]
        class2idx = {c: i for i, c in enumerate(idx2class)}
        default_class = 0

        discriminator = _build_discriminator(
            len(idx2class), pretrained_model, cached, device
        )

        # An example is labelled 1 as soon as the sum of its label
        # entries is positive.
        x, y = _load_dict_lines(
            "datasets/toxic/toxic_train.txt",
            discriminator.tokenizer, device,
            lambda raw_label: int(np.sum(raw_label) > 0)
        )
        train_dataset, test_dataset = _split_train_test(x, y)

    else:  # if dataset == "generic":
        # This assumes the input dataset is a TSV with the following structure:
        # class \t text

        # Also reject the empty string, which is what an unset CLI option
        # may deliver instead of None.
        if not dataset_fp:
            raise ValueError("When generic dataset is selected, "
                             "dataset_fp needs to be specified aswell.")

        # First pass: collect the set of class names.
        classes = set()
        with open(dataset_fp) as f:
            csv_reader = csv.reader(f, delimiter="\t")
            for row in tqdm(csv_reader, ascii=True):
                if row:
                    classes.add(row[0])

        idx2class = sorted(classes)
        class2idx = {c: i for i, c in enumerate(idx2class)}
        default_class = 0

        discriminator = _build_discriminator(
            len(idx2class), pretrained_model, cached, device
        )

        # Second pass: encode the texts.
        x = []
        y = []
        with open(dataset_fp) as f:
            csv_reader = csv.reader(f, delimiter="\t")
            for i, row in enumerate(tqdm(csv_reader, ascii=True)):
                if not row:
                    continue
                try:
                    # Inside the try so that rows without a text column
                    # are skipped instead of aborting the whole run.
                    label = row[0]
                    text = row[1]
                    seq = discriminator.tokenizer.encode(text)
                except Exception:
                    print("Error tokenizing line {}, skipping it".format(i))
                    continue

                if len(seq) >= max_length_seq:
                    print(
                        "Line {} is longer than maximum length {}".format(
                            i, max_length_seq
                        ))
                    continue

                x.append(_to_input_tensor(seq, device))
                y.append(class2idx[label])

        train_dataset, test_dataset = _split_train_test(x, y)

    discriminator_meta = {
        "class_size": len(idx2class),
        "embed_size": discriminator.embed_size,
        "pretrained_model": pretrained_model,
        "class_vocab": class2idx,
        "default_class": default_class,
    }

    end = time.time()
    print("Preprocessed {} data points".format(
        len(train_dataset) + len(test_dataset))
    )
    print("Data preprocessing took: {:.3f}s".format(end - start))

    if cached:
        print("Building representation cache...")

        start = time.time()

        train_loader = get_cached_data_loader(
            train_dataset, batch_size, discriminator,
            shuffle=True, device=device
        )

        test_loader = get_cached_data_loader(
            test_dataset, batch_size, discriminator, device=device
        )

        end = time.time()
        print("Building representation cache took: {:.3f}s".format(end - start))

    else:
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=collate_fn)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                  batch_size=batch_size,
                                                  collate_fn=collate_fn)

    if save_model:
        with open("{}_classifier_head_meta.json".format(dataset),
                  "w") as meta_file:
            json.dump(discriminator_meta, meta_file)

    optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)

    for epoch in range(epochs):
        start = time.time()
        print("\nEpoch", epoch + 1)

        train_epoch(
            discriminator=discriminator,
            data_loader=train_loader,
            optimizer=optimizer,
            epoch=epoch,
            log_interval=log_interval,
            device=device
        )
        evaluate_performance(
            data_loader=test_loader,
            discriminator=discriminator,
            device=device
        )

        end = time.time()
        print("Epoch took: {:.3f}s".format(end - start))

        print("\nExample prediction")
        predict(example_sentence, discriminator, idx2class,
                cached=cached, device=device)

        if save_model:
            # Only the classification head is saved, not the encoder.
            torch.save(discriminator.get_classifier().state_dict(),
                       "{}_classifier_head_epoch_{}.pt".format(dataset,
                                                               epoch + 1))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: every option is forwarded by name to
    # train_discriminator() as a keyword argument.
    parser = argparse.ArgumentParser(
        description="Train a discriminator on top of GPT-2 representations")
    # The adjacent help literals are concatenated, so each piece needs its
    # own trailing space.
    parser.add_argument("--dataset", type=str, default="SST",
                        choices=("SST", "clickbait", "toxic", "generic"),
                        help="dataset to train the discriminator on. "
                             "In case of generic, the dataset is expected "
                             "to be a TSV file with structure: class \\t text")
    # Default is None (not "") so that train_discriminator can detect a
    # missing path for the generic dataset and raise its ValueError.
    parser.add_argument("--dataset_fp", type=str, default=None,
                        help="File path of the dataset to use. "
                             "Needed only in case of generic dataset")
    parser.add_argument("--pretrained_model", type=str, default="gpt2-medium",
                        help="Pretrained model to use as encoder")
    parser.add_argument("--epochs", type=int, default=10, metavar="N",
                        help="Number of training epochs")
    parser.add_argument("--batch_size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--log_interval", type=int, default=10, metavar="N",
                        help="how many batches to wait before logging training status")
    parser.add_argument("--save_model", action="store_true",
                        help="whether to save the model")
    parser.add_argument("--cached", action="store_true",
                        help="whether to cache the input representations")
    parser.add_argument("--no_cuda", action="store_true",
                        help="use to turn off cuda")
    args = parser.parse_args()

    train_discriminator(**(vars(args)))
|
||||||
@@ -22,6 +22,7 @@ import glob
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
import json
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@@ -47,7 +48,11 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig,
|
DistilBertConfig,
|
||||||
DistilBertForSequenceClassification,
|
DistilBertForSequenceClassification,
|
||||||
DistilBertTokenizer)
|
DistilBertTokenizer,
|
||||||
|
AlbertConfig,
|
||||||
|
AlbertForSequenceClassification,
|
||||||
|
AlbertTokenizer,
|
||||||
|
)
|
||||||
|
|
||||||
from transformers import AdamW, get_linear_schedule_with_warmup
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
@@ -66,7 +71,8 @@ MODEL_CLASSES = {
|
|||||||
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
||||||
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
||||||
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
|
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
|
||||||
|
'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -99,6 +105,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
|
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
@@ -170,15 +177,23 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||||
# Log metrics
|
logs = {}
|
||||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||||
results = evaluate(args, model, tokenizer)
|
results = evaluate(args, model, tokenizer)
|
||||||
for key, value in results.items():
|
for key, value in results.items():
|
||||||
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
|
eval_key = 'eval_{}'.format(key)
|
||||||
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
|
logs[eval_key] = value
|
||||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
|
||||||
|
loss_scalar = (tr_loss - logging_loss) / args.logging_steps
|
||||||
|
learning_rate_scalar = scheduler.get_lr()[0]
|
||||||
|
logs['learning_rate'] = learning_rate_scalar
|
||||||
|
logs['loss'] = loss_scalar
|
||||||
logging_loss = tr_loss
|
logging_loss = tr_loss
|
||||||
|
|
||||||
|
for key, value in logs.items():
|
||||||
|
tb_writer.add_scalar(key, value, global_step)
|
||||||
|
print(json.dumps({**logs, **{'step': global_step}}))
|
||||||
|
|
||||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||||
# Save model checkpoint
|
# Save model checkpoint
|
||||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||||
@@ -216,7 +231,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||||
# Note that DistributedSampler samples randomly
|
# Note that DistributedSampler samples randomly
|
||||||
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
|
eval_sampler = SequentialSampler(eval_dataset)
|
||||||
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
# multi-gpu eval
|
# multi-gpu eval
|
||||||
@@ -317,7 +332,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||||
elif output_mode == "regression":
|
elif output_mode == "regression":
|
||||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
||||||
|
|
||||||
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
@@ -361,7 +376,7 @@ def main():
|
|||||||
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||||
help="Batch size per GPU/CPU for evaluation.")
|
help="Batch size per GPU/CPU for evaluation.")
|
||||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||||
help="The initial learning rate for Adam.")
|
help="The initial learning rate for Adam.")
|
||||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||||
|
|||||||
@@ -47,7 +47,8 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
|
|||||||
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
||||||
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
||||||
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
|
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
|
||||||
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
|
||||||
|
CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -58,7 +59,8 @@ MODEL_CLASSES = {
|
|||||||
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||||
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
||||||
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
||||||
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
|
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
|
||||||
|
'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -68,7 +70,7 @@ class TextDataset(Dataset):
|
|||||||
directory, filename = os.path.split(file_path)
|
directory, filename = os.path.split(file_path)
|
||||||
cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename)
|
cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename)
|
||||||
|
|
||||||
if os.path.exists(cached_features_file):
|
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
with open(cached_features_file, 'rb') as handle:
|
with open(cached_features_file, 'rb') as handle:
|
||||||
self.examples = pickle.load(handle)
|
self.examples = pickle.load(handle)
|
||||||
@@ -215,6 +217,10 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
|
|
||||||
global_step = 0
|
global_step = 0
|
||||||
tr_loss, logging_loss = 0.0, 0.0
|
tr_loss, logging_loss = 0.0, 0.0
|
||||||
|
|
||||||
|
model_to_resize = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||||
|
model_to_resize.resize_token_embeddings(len(tokenizer))
|
||||||
|
|
||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||||
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
|
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
|
||||||
@@ -297,7 +303,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||||
# Note that DistributedSampler samples randomly
|
# Note that DistributedSampler samples randomly
|
||||||
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
|
eval_sampler = SequentialSampler(eval_dataset)
|
||||||
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
# multi-gpu evaluate
|
# multi-gpu evaluate
|
||||||
@@ -431,7 +437,7 @@ def main():
|
|||||||
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
|
if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
|
||||||
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
||||||
"flag (masked language modeling).")
|
"flag (masked language modeling).")
|
||||||
if args.eval_data_file is None and args.do_eval:
|
if args.eval_data_file is None and args.do_eval:
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
|
|||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||||
# Note that DistributedSampler samples randomly
|
# Note that DistributedSampler samples randomly
|
||||||
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
|
eval_sampler = SequentialSampler(eval_dataset)
|
||||||
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
# multi-gpu evaluate
|
# multi-gpu evaluate
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
|
|||||||
"attention_mask": batch[1],
|
"attention_mask": batch[1],
|
||||||
"labels": batch[3]}
|
"labels": batch[3]}
|
||||||
if args.model_type != "distilbert":
|
if args.model_type != "distilbert":
|
||||||
inputs["token_type_ids"]: batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
|
inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
|
||||||
|
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
||||||
@@ -217,7 +217,7 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
|
|||||||
"attention_mask": batch[1],
|
"attention_mask": batch[1],
|
||||||
"labels": batch[3]}
|
"labels": batch[3]}
|
||||||
if args.model_type != "distilbert":
|
if args.model_type != "distilbert":
|
||||||
inputs["token_type_ids"]: batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
|
inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
tmp_eval_loss, logits = outputs[:2]
|
tmp_eval_loss, logits = outputs[:2]
|
||||||
|
|
||||||
|
|||||||
@@ -43,7 +43,8 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLMTokenizer, XLNetConfig,
|
XLMTokenizer, XLNetConfig,
|
||||||
XLNetForQuestionAnswering,
|
XLNetForQuestionAnswering,
|
||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
|
||||||
|
AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
|
from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
|
||||||
|
|
||||||
@@ -56,7 +57,8 @@ MODEL_CLASSES = {
|
|||||||
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
||||||
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
||||||
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
|
||||||
|
'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
def set_seed(args):
|
def set_seed(args):
|
||||||
@@ -121,7 +123,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||||
logger.info(" Total optimization steps = %d", t_total)
|
logger.info(" Total optimization steps = %d", t_total)
|
||||||
|
|
||||||
global_step = 0
|
global_step = 1
|
||||||
tr_loss, logging_loss = 0.0, 0.0
|
tr_loss, logging_loss = 0.0, 0.0
|
||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||||
@@ -214,7 +216,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||||
|
|
||||||
# Note that DistributedSampler samples randomly
|
# Note that DistributedSampler samples randomly
|
||||||
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
|
eval_sampler = SequentialSampler(dataset)
|
||||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
# multi-gpu evaluate
|
# multi-gpu evaluate
|
||||||
@@ -558,7 +560,7 @@ def main():
|
|||||||
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||||
|
|
||||||
# Load a trained model and vocabulary that you have fine-tuned
|
# Load a trained model and vocabulary that you have fine-tuned
|
||||||
model = model_class.from_pretrained(args.output_dir)
|
model = model_class.from_pretrained(args.output_dir, force_download=True)
|
||||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||||
model.to(args.device)
|
model.to(args.device)
|
||||||
|
|
||||||
@@ -576,7 +578,7 @@ def main():
|
|||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
# Reload the model
|
# Reload the model
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
model = model_class.from_pretrained(checkpoint)
|
model = model_class.from_pretrained(checkpoint, force_download=True)
|
||||||
model.to(args.device)
|
model.to(args.device)
|
||||||
|
|
||||||
# Evaluate
|
# Evaluate
|
||||||
|
|||||||
615
examples/run_tf_ner.py
Normal file
615
examples/run_tf_ner.py
Normal file
@@ -0,0 +1,615 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
import datetime
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
import glob
|
||||||
|
import re
|
||||||
|
import tensorflow as tf
|
||||||
|
import collections
|
||||||
|
import numpy as np
|
||||||
|
from seqeval import metrics
|
||||||
|
import _pickle as pickle
|
||||||
|
from absl import logging
|
||||||
|
from transformers import TF2_WEIGHTS_NAME, BertConfig, BertTokenizer, TFBertForTokenClassification
|
||||||
|
from transformers import RobertaConfig, RobertaTokenizer, TFRobertaForTokenClassification
|
||||||
|
from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForTokenClassification
|
||||||
|
from transformers import create_optimizer, GradientAccumulator
|
||||||
|
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
||||||
|
from fastprogress import master_bar, progress_bar
|
||||||
|
from absl import flags
|
||||||
|
from absl import app
|
||||||
|
|
||||||
|
|
||||||
|
# Flat tuple of every pretrained shortcut name exposed by the supported config
# classes; it is only used to build the --model_name_or_path help text.
ALL_MODELS = tuple(
    shortcut_name
    for config_class in (BertConfig, RobertaConfig, DistilBertConfig)
    for shortcut_name in config_class.pretrained_config_archive_map.keys()
)

# Maps a --model_type value to its (config class, TF token-classification
# model class, tokenizer class) triple.
MODEL_CLASSES = dict(
    bert=(BertConfig, TFBertForTokenClassification, BertTokenizer),
    roberta=(RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
    distilbert=(DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer),
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Command-line flags (absl). Each call registers one flag as
# (name, default value, help text).
# ---------------------------------------------------------------------------

# --- Data, model selection and output locations ---
flags.DEFINE_string(
    "data_dir", None,
    "The input data dir. Should contain the .conll files (or other data files) "
    "for the task.")

flags.DEFINE_string(
    "model_type", None,
    "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))

flags.DEFINE_string(
    "model_name_or_path", None,
    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))

flags.DEFINE_string(
    "output_dir", None,
    "The output directory where the model checkpoints will be written.")

flags.DEFINE_string(
    "labels", "",
    "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")

flags.DEFINE_string(
    "config_name", "",
    "Pretrained config name or path if not the same as model_name")

flags.DEFINE_string(
    "tokenizer_name", "",
    "Pretrained tokenizer name or path if not the same as model_name")

flags.DEFINE_string(
    "cache_dir", "",
    "Where do you want to store the pre-trained models downloaded from s3")

flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sentence length after tokenization. "
    "Sequences longer than this will be truncated, sequences shorter "
    "will be padded.")

# --- TPU configuration ---
flags.DEFINE_string(
    "tpu", None,
    "The Cloud TPU to use for training. This should be either the name "
    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
    "url.")

flags.DEFINE_integer(
    "num_tpu_cores", 8,
    "Total number of TPU cores to use.")

# --- Which phases to run ---
flags.DEFINE_boolean(
    "do_train", False,
    "Whether to run training.")

flags.DEFINE_boolean(
    "do_eval", False,
    "Whether to run eval on the dev set.")

flags.DEFINE_boolean(
    "do_predict", False,
    "Whether to run predictions on the test set.")

flags.DEFINE_boolean(
    "evaluate_during_training", False,
    "Whether to run evaluation during training at each logging step.")

flags.DEFINE_boolean(
    "do_lower_case", False,
    "Set this flag if you are using an uncased model.")

# --- Batch sizes and optimisation hyper-parameters ---
flags.DEFINE_integer(
    "per_device_train_batch_size", 8,
    "Batch size per GPU/CPU/TPU for training.")

flags.DEFINE_integer(
    "per_device_eval_batch_size", 8,
    "Batch size per GPU/CPU/TPU for evaluation.")

flags.DEFINE_integer(
    "gradient_accumulation_steps", 1,
    "Number of updates steps to accumulate before performing a backward/update pass.")

flags.DEFINE_float(
    "learning_rate", 5e-5,
    "The initial learning rate for Adam.")

flags.DEFINE_float(
    "weight_decay", 0.0,
    "Weight decay if we apply some.")

flags.DEFINE_float(
    "adam_epsilon", 1e-8,
    "Epsilon for Adam optimizer.")

flags.DEFINE_float(
    "max_grad_norm", 1.0,
    "Max gradient norm.")

# --- Training schedule, logging and checkpointing ---
flags.DEFINE_integer(
    "num_train_epochs", 3,
    "Total number of training epochs to perform.")

flags.DEFINE_integer(
    "max_steps", -1,
    "If > 0: set total number of training steps to perform. Override num_train_epochs.")

flags.DEFINE_integer(
    "warmup_steps", 0,
    "Linear warmup over warmup_steps.")

flags.DEFINE_integer(
    "logging_steps", 50,
    "Log every X updates steps.")

flags.DEFINE_integer(
    "save_steps", 50,
    "Save checkpoint every X updates steps.")

flags.DEFINE_boolean(
    "eval_all_checkpoints", False,
    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")

# --- Runtime / environment switches ---
flags.DEFINE_boolean(
    "no_cuda", False,
    "Avoid using CUDA when available")

flags.DEFINE_boolean(
    "overwrite_output_dir", False,
    "Overwrite the content of the output directory")

flags.DEFINE_boolean(
    "overwrite_cache", False,
    "Overwrite the cached training and evaluation sets")

flags.DEFINE_integer(
    "seed", 42,
    "random seed for initialization")

flags.DEFINE_boolean(
    "fp16", False,
    "Whether to use 16-bit (mixed) precision instead of 32-bit")

flags.DEFINE_string(
    "gpus", "0",
    "Comma separated list of gpus devices. If only one, switch to single "
    "gpu strategy, if None takes all the gpus available.")
|
||||||
|
|
||||||
|
|
||||||
|
def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id):
    """Run the training loop with gradient accumulation under ``strategy``.

    Args:
        args: Mutable mapping of settings, indexed by string key. Keys read
            here: ``max_steps``, ``gradient_accumulation_steps``,
            ``num_train_epochs``, ``learning_rate``, ``warmup_steps``,
            ``fp16``, ``per_device_train_batch_size``, ``n_device``,
            ``max_grad_norm``, ``model_type``, ``logging_steps``,
            ``evaluate_during_training``, ``save_steps`` and ``output_dir``.
            ``args['num_train_epochs']`` is overwritten with 1 when
            ``max_steps`` is positive.
        strategy: Distribution strategy object; its ``scope``,
            ``experimental_run_v2`` and ``reduce`` methods are used.
        train_dataset: Iterable yielding ``(features, labels)`` pairs where
            ``features`` is indexable by ``'input_ids'``, ``'input_mask'``
            and (for non-distilbert models) ``'segment_ids'``.
        tokenizer: Only forwarded to ``evaluate``.
        model: Model called as ``model(input_ids, **inputs)``; it must expose
            ``trainable_variables``, ``summary()`` and ``save_pretrained()``.
        num_train_examples: Number of training examples, used to derive the
            number of training steps.
        labels: Sequence of label names; logits are reshaped to
            ``len(labels) + 1`` classes.
        train_batch_size: Global batch size used to normalise the loss and
            to derive the number of steps.
        pad_token_label_id: Only forwarded to ``evaluate``.

    Returns:
        None. Checkpoints are written under ``args['output_dir']`` and
        summaries to the hard-coded log directory below.
    """
    if args['max_steps'] > 0:
        num_train_steps = args['max_steps'] * args['gradient_accumulation_steps']
        args['num_train_epochs'] = 1
    else:
        num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    # NOTE(review): the summary directory is hard-coded; kept as-is so that
    # existing TensorBoard setups keep working.
    writer = tf.summary.create_file_writer("/tmp/mylogs")

    with strategy.scope():
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
        optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps'])

        if args['fp16']:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')

        loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()

    logging.info("***** Running training *****")
    logging.info(" Num examples = %d", num_train_examples)
    logging.info(" Num Epochs = %d", args['num_train_epochs'])
    logging.info(" Instantaneous batch size per device = %d", args['per_device_train_batch_size'])
    logging.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                 train_batch_size * args['gradient_accumulation_steps'])
    logging.info(" Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logging.info(" Total training steps = %d", num_train_steps)

    model.summary()

    @tf.function
    def apply_gradients():
        """Apply the accumulated gradients (averaged) and clear the accumulator."""
        grads_and_vars = []

        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
            if gradient is not None:
                # Average over devices and over the accumulated micro-batches.
                scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps'])
                grads_and_vars.append((scaled_gradient, variable))
            else:
                grads_and_vars.append((gradient, variable))

        optimizer.apply_gradients(grads_and_vars, args['max_grad_norm'])
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        """Accumulate gradients for one batch and return its mean loss."""
        def step_fn(train_features, train_labels):
            inputs = {'attention_mask': train_features['input_mask'], 'training': True}

            if args['model_type'] != "distilbert":
                inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None

            with tf.GradientTape() as tape:
                logits = model(train_features['input_ids'], **inputs)[0]
                logits = tf.reshape(logits, (-1, len(labels) + 1))
                # Only positions selected by the input mask contribute to the loss.
                active_loss = tf.reshape(train_features['input_mask'], (-1,))
                active_logits = tf.boolean_mask(logits, active_loss)
                train_labels = tf.reshape(train_labels, (-1,))
                active_labels = tf.boolean_mask(train_labels, active_loss)
                cross_entropy = loss_fct(active_labels, active_logits)
                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
                grads = tape.gradient(loss, model.trainable_variables)

                gradient_accumulator(grads)

            return cross_entropy

        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)

        return mean_loss

    current_time = datetime.datetime.now()
    train_iterator = master_bar(range(args['num_train_epochs']))
    global_step = 0

    for epoch in train_iterator:
        epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1)
        # Per-epoch batch counter; it only drives the accumulation boundary.
        step = 1

        with strategy.scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)

                if step % args['gradient_accumulation_steps'] == 0:
                    strategy.experimental_run_v2(apply_gradients)

                loss_metric(loss)

                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    # Log metrics
                    if args['n_device'] == 1 and args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
                        y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
                        report = metrics.classification_report(y_true, y_pred, digits=4)

                        logging.info("Eval at step " + str(global_step) + "\n" + report)
                        logging.info("eval_loss: " + str(eval_loss))

                        precision = metrics.precision_score(y_true, y_pred)
                        recall = metrics.recall_score(y_true, y_pred)
                        f1 = metrics.f1_score(y_true, y_pred)

                        with writer.as_default():
                            tf.summary.scalar("eval_loss", eval_loss, global_step)
                            tf.summary.scalar("precision", precision, global_step)
                            tf.summary.scalar("recall", recall, global_step)
                            tf.summary.scalar("f1", f1, global_step)

                    # Query the schedule at the optimizer's own update counter.
                    # The per-epoch `step` restarts at 1 every epoch and also
                    # counts accumulation micro-batches, so it reported a
                    # learning rate the optimizer was not actually using.
                    lr = optimizer.learning_rate
                    learning_rate = lr(optimizer.iterations)

                    with writer.as_default():
                        tf.summary.scalar("lr", learning_rate, global_step)

                # `loss_metric` is a running mean for the current epoch. It is
                # logged against `global_step` so later epochs do not collide
                # with earlier points. The former interval value (difference of
                # two running means divided by logging_steps) was not a loss
                # average and has been dropped.
                with writer.as_default():
                    tf.summary.scalar("loss", loss_metric.result(), step=global_step)

                if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step))

                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)

                    model.save_pretrained(output_dir)
                    logging.info("Saving model checkpoint to %s", output_dir)

                train_iterator.child.comment = f'loss : {loss_metric.result()}'
                step += 1

        train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}')

        loss_metric.reset_states()

    logging.info(" Training took time = {}".format(datetime.datetime.now() - current_time))
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
    """Evaluate ``model`` on the ``mode`` split and return labels, predictions and loss.

    Args:
        args: Mapping of settings; ``per_device_eval_batch_size``,
            ``n_device`` and ``model_type`` are read here, and the whole
            mapping is forwarded to ``load_and_cache_examples``.
        strategy: Distribution strategy used to distribute the dataset and
            to scope the forward pass.
        model: Model called as ``model(input_ids, **inputs)``; element 0 of
            its output is taken as the logits.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        labels: Sequence of label names; logits are reshaped to
            ``len(labels) + 1`` classes.
        pad_token_label_id: Label id marking positions to ignore when
            building the returned label sequences.
        mode: Name of the split to load (``train`` calls this with "dev").

    Returns:
        Tuple ``(y_true, y_pred, loss)``: two lists holding one list of label
        names per example, and the summed per-batch loss divided by the
        number of evaluation steps.

    Raises:
        ValueError: If the evaluation dataset yields no batches.
    """
    eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
    eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode)
    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
    num_eval_steps = math.ceil(size / eval_batch_size)
    master = master_bar(range(1))
    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1)
    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0.0

    # Per-batch arrays are collected and concatenated once after the loop;
    # growing an array with np.append re-copies everything on every batch.
    batch_logits = []
    batch_label_ids = []

    logging.info("***** Running evaluation *****")
    logging.info(" Num examples = %d", size)
    logging.info(" Batch size = %d", eval_batch_size)

    for eval_features, eval_labels in eval_iterator:
        inputs = {'attention_mask': eval_features['input_mask'], 'training': False}

        if args['model_type'] != "distilbert":
            inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None

        with strategy.scope():
            logits = model(eval_features['input_ids'], **inputs)[0]
            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
            # Only positions selected by the input mask contribute to the loss.
            active_loss = tf.reshape(eval_features['input_mask'], (-1,))
            active_logits = tf.boolean_mask(tmp_logits, active_loss)
            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
            cross_entropy = loss_fct(active_labels, active_logits)
            loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)

        batch_logits.append(logits.numpy())
        batch_label_ids.append(eval_labels.numpy())

    if not batch_logits:
        # Previously this surfaced as an opaque error from np.argmax(None, axis=2).
        raise ValueError("No evaluation batches were produced for mode '{}'".format(mode))

    preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=2)
    label_ids = np.concatenate(batch_label_ids, axis=0)
    loss = loss / num_eval_steps

    y_true = []
    y_pred = []

    for label_row, pred_row in zip(label_ids, preds):
        keep = label_row != pad_token_label_id
        # Ids are shifted by one relative to `labels`.
        # NOTE(review): a predicted id of 0 maps to labels[-1]; presumably id 0
        # is reserved for padding - verify against the feature conversion.
        y_pred.append([labels[pred_id - 1] for pred_id in pred_row[keep]])
        y_true.append([labels[label_id - 1] for label_id in label_row[keep]])

    return y_true, y_pred, loss.numpy()
|
||||||
|
|
||||||
|
|
||||||
|
def load_cache(cached_file, max_seq_length):
    """Open a TFRecord feature cache and count its records.

    Args:
        cached_file: Path of the TFRecord file to read.
        max_seq_length: Fixed length of every int64 feature vector stored in
            each record.

    Returns:
        Tuple ``(dataset, count)``. ``dataset`` yields ``(features, label_ids)``
        pairs, where ``features`` is a dict with ``input_ids``, ``input_mask``
        and ``segment_ids``. ``count`` is the number of records in the file.
    """
    name_to_features = {
        feature_name: tf.io.FixedLenFeature([max_seq_length], tf.int64)
        for feature_name in ("input_ids", "input_mask", "segment_ids", "label_ids")
    }

    def _decode_record(record):
        """Parse one serialized example into (model inputs, label ids)."""
        example = tf.io.parse_single_example(record, name_to_features)
        features = {
            'input_ids': example['input_ids'],
            'input_mask': example['input_mask'],
            'segment_ids': example['segment_ids'],
        }

        return features, example['label_ids']

    records = tf.data.TFRecordDataset(cached_file)
    # Count the raw records rather than the decoded ones: the total is the
    # same, but no example has to be parsed just to be counted. A malformed
    # record is therefore reported when the dataset is consumed, not here.
    count = records.reduce(0, lambda x, _: x + 1)
    d = records.map(_decode_record, num_parallel_calls=4)

    return d, count.numpy()
|
||||||
|
|
||||||
|
|
||||||
|
def save_cache(features, cached_features_file):
    """Serialize ``features`` into a TFRecord file.

    Args:
        features: Sized sequence of objects exposing ``input_ids``,
            ``input_mask``, ``segment_ids`` and ``label_ids`` iterables of
            integers.
        cached_features_file: Path of the TFRecord file to write.

    Returns:
        None.
    """
    def create_int_feature(values):
        """Wrap an iterable of ints in an int64 ``tf.train.Feature``."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    num_features = len(features)

    # The context manager closes the writer even if serialization raises,
    # so a failed run does not leak the file handle.
    with tf.io.TFRecordWriter(cached_features_file) as writer:
        for ex_index, feature in enumerate(features):
            if ex_index % 5000 == 0:
                logging.info("Writing example %d of %d", ex_index, num_features)

            record_feature = collections.OrderedDict()
            record_feature["input_ids"] = create_int_feature(feature.input_ids)
            record_feature["input_mask"] = create_int_feature(feature.input_mask)
            record_feature["segment_ids"] = create_int_feature(feature.segment_ids)
            record_feature["label_ids"] = create_int_feature(feature.label_ids)

            tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature))

            writer.write(tf_example.SerializeToString())
|
||||||
|
|
||||||
|
|
||||||
|
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
    """Return ``(dataset, size)`` for ``mode``, building the TFRecord cache if needed.

    The cache file lives in ``args['data_dir']`` and is keyed by mode, model
    name and maximum sequence length. It is reused unless it is missing or
    ``args['overwrite_cache']`` is set. The dataset is always read back from
    the cache file, then batched and prefetched; in ``'train'`` mode it is
    also repeated and shuffled.
    """
    drop_remainder = bool(args['tpu'] or mode == 'train')

    # Load data features from cache or dataset file
    model_name = [part for part in args['model_name_or_path'].split("/") if part][-1]
    cache_name = "cached_{}_{}_{}.tf_record".format(mode, model_name, str(args['max_seq_length']))
    cached_features_file = os.path.join(args['data_dir'], cache_name)

    if os.path.exists(cached_features_file) and not args['overwrite_cache']:
        logging.info("Loading features from cached file %s", cached_features_file)
    else:
        logging.info("Creating features from dataset file at %s", args['data_dir'])
        examples = read_examples_from_file(args['data_dir'], mode)

        # xlnet has a cls token at the end and is padded on the left
        is_xlnet = args['model_type'] in ["xlnet"]
        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        is_roberta = args['model_type'] in ["roberta"]

        features = convert_examples_to_features(
            examples,
            labels,
            args['max_seq_length'],
            tokenizer,
            cls_token_at_end=bool(is_xlnet),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(is_roberta),
            pad_on_left=bool(is_xlnet),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if is_xlnet else 0,
            pad_token_label_id=pad_token_label_id,
        )
        logging.info("Saving features into cached file %s", cached_features_file)
        save_cache(features, cached_features_file)

    # Both paths read the dataset back from the cache file
    dataset, size = load_cache(cached_features_file, args['max_seq_length'])

    if mode == 'train':
        dataset = dataset.repeat().shuffle(buffer_size=8192, seed=args['seed'])

    dataset = dataset.batch(batch_size, drop_remainder).prefetch(buffer_size=batch_size)

    return dataset, size
|
||||||
|
|
||||||
|
|
||||||
|
def main(_):
    """Run training, evaluation and/or prediction as selected by the flags.

    Called through ``app.run(main)``; the positional argument is ignored.
    All settings are read from ``flags.FLAGS``.

    Raises:
        ValueError: if training is requested into an existing, non-empty
            output directory without ``--overwrite_output_dir``.
    """
    logging.set_verbosity(logging.INFO)
    args = flags.FLAGS.flag_values_dict()

    if os.path.exists(args['output_dir']) and os.listdir(
            args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args['output_dir']))

    if args['fp16']:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    # Pick the distribution strategy: TPU, several GPUs, CPU only, or one GPU
    if args['tpu']:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu'])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        args['n_device'] = args['num_tpu_cores']
    elif len(args['gpus'].split(',')) > 1:
        devices = [f"/gpu:{gpu}" for gpu in args['gpus'].split(',')]
        args['n_device'] = len(devices)
        strategy = tf.distribute.MirroredStrategy(devices=devices)
    elif args['no_cuda']:
        args['n_device'] = 1
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    else:
        args['n_device'] = len(args['gpus'].split(','))
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0])

    logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s",
                    args['n_device'], bool(args['n_device'] > 1), args['fp16'])

    labels = get_labels(args['labels'])
    # One extra class on top of the real labels; label id 0 is used for padding
    num_labels = len(labels) + 1
    pad_token_label_id = 0
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
    config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
                                          num_labels=num_labels,
                                          cache_dir=args['cache_dir'] if args['cache_dir'] else None)

    logging.info("Training/evaluation parameters %s", args)

    # Training
    if args['do_train']:
        tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
                                                    do_lower_case=args['do_lower_case'],
                                                    cache_dir=args['cache_dir'] if args['cache_dir'] else None)

        with strategy.scope():
            model = model_class.from_pretrained(args['model_name_or_path'],
                                                from_pt=bool(".bin" in args['model_name_or_path']),
                                                config=config,
                                                cache_dir=args['cache_dir'] if args['cache_dir'] else None)
            model.layers[-1].activation = tf.keras.activations.softmax

        train_batch_size = args['per_device_train_batch_size'] * args['n_device']
        train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train")
        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
        train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id)

        if not os.path.exists(args['output_dir']):
            os.makedirs(args['output_dir'])

        logging.info("Saving model to %s", args['output_dir'])

        model.save_pretrained(args['output_dir'])
        tokenizer.save_pretrained(args['output_dir'])

    # Evaluation
    if args['do_eval']:
        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
        checkpoints = []
        results = []

        if args['eval_all_checkpoints']:
            # Directories holding saved weights, ordered by the digits found in their path
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1)))

        logging.info("Evaluate the following checkpoints: %s", checkpoints)

        if len(checkpoints) == 0:
            checkpoints.append(args['output_dir'])

        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"

            with strategy.scope():
                model = model_class.from_pretrained(checkpoint)

            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
            report = metrics.classification_report(y_true, y_pred, digits=4)

            if global_step:
                results.append({global_step + "_report": report, global_step + "_loss": eval_loss})

        output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")

        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            for res in results:
                for key, val in res.items():
                    if "loss" in key:
                        logging.info(key + " = " + str(val))
                        writer.write(key + " = " + str(val))
                        writer.write("\n")
                    else:
                        # Emit the report stored under this key, not the one
                        # left over from the last checkpoint evaluated above.
                        logging.info(key)
                        logging.info("\n" + val)
                        writer.write(key + "\n")
                        writer.write(val)
                        writer.write("\n")

    if args['do_predict']:
        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
        model = model_class.from_pretrained(args['output_dir'])
        eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
        # NOTE(review): the returned dataset is not used below; the call is kept
        # because it creates the test feature cache when it is missing. Confirm
        # whether evaluate() still relies on that.
        predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test")
        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
        output_test_results_file = os.path.join(args['output_dir'], "test_results.txt")
        output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
        report = metrics.classification_report(y_true, y_pred, digits=4)

        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
            logging.info("\n" + report)

            writer.write(report)
            writer.write("\n\nloss = " + str(pred_loss))

        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
            with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f:
                example_id = 0

                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)

                        # A separator line ends the current example once all of
                        # its predictions have been consumed.
                        if not y_pred[example_id]:
                            example_id += 1
                    elif y_pred[example_id]:
                        output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # These flags have no usable default, so require them before main() runs
    for required_flag in ("data_dir", "output_dir", "model_name_or_path", "model_type"):
        flags.mark_flag_as_required(required_flag)
    app.run(main)
|
||||||
515
examples/run_xnli.py
Normal file
515
examples/run_xnli.py
Normal file
@@ -0,0 +1,515 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Finetuning multi-lingual models on XNLI (Bert, DistilBERT, XLM).
|
||||||
|
Adapted from `examples/run_glue.py`"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import glob
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||||
|
TensorDataset)
|
||||||
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
|
||||||
|
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    # Fall back to tensorboardX when torch's own tensorboard module cannot be
    # imported. Catching only ImportError lets KeyboardInterrupt and
    # SystemExit propagate, which the previous bare `except` swallowed.
    from tensorboardX import SummaryWriter
|
||||||
|
|
||||||
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
|
from transformers import (WEIGHTS_NAME,
|
||||||
|
BertConfig, BertForSequenceClassification, BertTokenizer,
|
||||||
|
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
|
||||||
|
DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
|
||||||
|
|
||||||
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
|
from transformers import xnli_compute_metrics as compute_metrics
|
||||||
|
from transformers import xnli_output_modes as output_modes
|
||||||
|
from transformers import xnli_processors as processors
|
||||||
|
|
||||||
|
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# Keys of the pretrained config archive maps of all supported config classes,
# listed in the --model_name_or_path help text
ALL_MODELS = tuple(
    shortcut
    for config_class in (BertConfig, DistilBertConfig, XLMConfig)
    for shortcut in config_class.pretrained_config_archive_map.keys()
)

# model_type -> (config class, model class, tokenizer class)
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForSequenceClassification, BertTokenizer),
    xlm=(XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    distilbert=(DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
)
|
||||||
|
|
||||||
|
|
||||||
|
def set_seed(args):
    """Seed the Python, NumPy and torch random generators with ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
|
||||||
|
|
||||||
|
|
||||||
|
def train(args, train_dataset, model, tokenizer):
    """Train the model.

    Args:
        args: Parsed command-line namespace. The optimisation, logging,
            checkpointing and distributed settings are read from it;
            ``args.train_batch_size`` is set here, and ``args.num_train_epochs``
            is overwritten when ``args.max_steps`` is positive.
        train_dataset: Dataset whose batches unpack to
            ``(input_ids, attention_mask, token_type_ids, labels)``.
        model: Model to optimise; the first element of its output is the loss.
        tokenizer: Only forwarded to ``evaluate`` when evaluating during training.

    Returns:
        Tuple ``(global_step, average_loss)``: the number of optimizer updates
        performed and the accumulated loss divided by that number (``0.0``
        when no update was performed).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    # Biases and LayerNorm weights are excluded from weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None  # XLM and DistilBERT don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            # Only update the weights once a full accumulation window has been processed
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # No optimizer update happens when the dataloader is empty or holds fewer
    # batches than one accumulation window; avoid dividing by zero then.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the evaluation split of ``args.task_name``.

    The metrics are logged and written to ``eval_results.txt`` under
    ``args.output_dir`` (inside the ``prefix`` sub-directory when given).

    Args:
        args: Parsed command-line namespace; ``args.eval_batch_size`` is set here.
        model: Model to evaluate; its first two outputs are the loss and the logits.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Label used in log messages and as sub-directory for the results file.

    Returns:
        Dict with the metrics returned by ``compute_metrics``.

    Raises:
        ValueError: if ``args.output_mode`` is not ``"classification"``.
    """
    eval_task_names = (args.task_name,)
    eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
        # train() passes an already wrapped model when evaluating during
        # training; do not wrap it in DataParallel a second time.
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None  # XLM and DistilBERT don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # Accumulate logits and gold labels of all batches along axis 0
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        else:
            raise ValueError('No other `output_mode` for XNLI.')
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Return a ``TensorDataset`` with the features of the train or test split.

    Features are read from a cache file in ``args.data_dir`` when one exists
    and ``args.overwrite_cache`` is unset. Otherwise they are built from the
    processor's examples, and saved to the cache by the process whose
    ``local_rank`` is -1 or 0.

    Raises:
        ValueError: if the task's output mode is not ``"classification"``.
    """
    is_first_process = args.local_rank in [-1, 0]

    if not is_first_process and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    processor = processors[task](language=args.language, train_language=args.train_language)
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    split_name = 'test' if evaluate else 'train'
    model_name = list(filter(None, args.model_name_or_path.split('/'))).pop()
    if not evaluate and args.train_language is not None:
        cache_language = args.train_language
    else:
        cache_language = args.language
    cache_name = 'cached_{}_{}_{}_{}_{}'.format(
        split_name,
        model_name,
        str(args.max_seq_length),
        str(task),
        str(cache_language))
    cached_features_file = os.path.join(args.data_dir, cache_name)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)
        pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,
            pad_token=pad_token_id,
            pad_token_segment_id=0,
        )
        if is_first_process:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    def as_long_tensor(values):
        return torch.tensor(values, dtype=torch.long)

    all_input_ids = as_long_tensor([f.input_ids for f in features])
    all_attention_mask = as_long_tensor([f.attention_mask for f in features])
    all_token_type_ids = as_long_tensor([f.token_type_ids for f in features])
    if output_mode != "classification":
        raise ValueError('No other `output_mode` for XNLI.')
    all_labels = as_long_tensor([f.label for f in features])

    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
## Required parameters
|
||||||
|
parser.add_argument("--data_dir", default=None, type=str, required=True,
|
||||||
|
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
|
||||||
|
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||||
|
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||||
|
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||||
|
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||||
|
parser.add_argument("--language", default=None, type=str, required=True,
|
||||||
|
help="Evaluation language. Also train language if `train_language` is set to None.")
|
||||||
|
parser.add_argument("--train_language", default=None, type=str,
|
||||||
|
help="Train language if is different of the evaluation language.")
|
||||||
|
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||||
|
help="The output directory where the model predictions and checkpoints will be written.")
|
||||||
|
|
||||||
|
## Other parameters
|
||||||
|
parser.add_argument("--config_name", default="", type=str,
|
||||||
|
help="Pretrained config name or path if not the same as model_name")
|
||||||
|
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||||
|
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||||
|
parser.add_argument("--cache_dir", default="", type=str,
|
||||||
|
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||||
|
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||||
|
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||||
|
"than this will be truncated, sequences shorter will be padded.")
|
||||||
|
parser.add_argument("--do_train", action='store_true',
|
||||||
|
help="Whether to run training.")
|
||||||
|
parser.add_argument("--do_eval", action='store_true',
|
||||||
|
help="Whether to run eval on the test set.")
|
||||||
|
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||||
|
help="Rul evaluation during training at each logging step.")
|
||||||
|
parser.add_argument("--do_lower_case", action='store_true',
|
||||||
|
help="Set this flag if you are using an uncased model.")
|
||||||
|
|
||||||
|
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
|
||||||
|
help="Batch size per GPU/CPU for training.")
|
||||||
|
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||||
|
help="Batch size per GPU/CPU for evaluation.")
|
||||||
|
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||||
|
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||||
|
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||||
|
help="The initial learning rate for Adam.")
|
||||||
|
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||||
|
help="Weight deay if we apply some.")
|
||||||
|
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||||
|
help="Epsilon for Adam optimizer.")
|
||||||
|
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||||
|
help="Max gradient norm.")
|
||||||
|
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||||
|
help="Total number of training epochs to perform.")
|
||||||
|
parser.add_argument("--max_steps", default=-1, type=int,
|
||||||
|
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||||
|
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||||
|
help="Linear warmup over warmup_steps.")
|
||||||
|
|
||||||
|
parser.add_argument('--logging_steps', type=int, default=50,
|
||||||
|
help="Log every X updates steps.")
|
||||||
|
parser.add_argument('--save_steps', type=int, default=50,
|
||||||
|
help="Save checkpoint every X updates steps.")
|
||||||
|
parser.add_argument("--eval_all_checkpoints", action='store_true',
|
||||||
|
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||||
|
parser.add_argument("--no_cuda", action='store_true',
|
||||||
|
help="Avoid using CUDA when available")
|
||||||
|
parser.add_argument('--overwrite_output_dir', action='store_true',
|
||||||
|
help="Overwrite the content of the output directory")
|
||||||
|
parser.add_argument('--overwrite_cache', action='store_true',
|
||||||
|
help="Overwrite the cached training and evaluation sets")
|
||||||
|
parser.add_argument('--seed', type=int, default=42,
|
||||||
|
help="random seed for initialization")
|
||||||
|
|
||||||
|
parser.add_argument('--fp16', action='store_true',
|
||||||
|
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||||
|
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||||
|
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||||
|
"See details at https://nvidia.github.io/apex/amp.html")
|
||||||
|
parser.add_argument("--local_rank", type=int, default=-1,
|
||||||
|
help="For distributed training: local_rank")
|
||||||
|
parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
|
||||||
|
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||||
|
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||||
|
|
||||||
|
# Setup distant debugging if needed
|
||||||
|
if args.server_ip and args.server_port:
|
||||||
|
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||||
|
import ptvsd
|
||||||
|
print("Waiting for debugger attach")
|
||||||
|
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||||
|
ptvsd.wait_for_attach()
|
||||||
|
|
||||||
|
# Setup CUDA, GPU & distributed training
|
||||||
|
if args.local_rank == -1 or args.no_cuda:
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||||
|
args.n_gpu = torch.cuda.device_count()
|
||||||
|
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||||
|
torch.cuda.set_device(args.local_rank)
|
||||||
|
device = torch.device("cuda", args.local_rank)
|
||||||
|
torch.distributed.init_process_group(backend='nccl')
|
||||||
|
args.n_gpu = 1
|
||||||
|
args.device = device
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
|
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||||
|
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||||
|
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||||
|
|
||||||
|
# Set seed
|
||||||
|
set_seed(args)
|
||||||
|
|
||||||
|
# Prepare XNLI task
|
||||||
|
args.task_name = 'xnli'
|
||||||
|
if args.task_name not in processors:
|
||||||
|
raise ValueError("Task not found: %s" % (args.task_name))
|
||||||
|
processor = processors[args.task_name](language=args.language, train_language=args.train_language)
|
||||||
|
args.output_mode = output_modes[args.task_name]
|
||||||
|
label_list = processor.get_labels()
|
||||||
|
num_labels = len(label_list)
|
||||||
|
|
||||||
|
# Load pretrained model and tokenizer
|
||||||
|
if args.local_rank not in [-1, 0]:
|
||||||
|
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||||
|
|
||||||
|
args.model_type = args.model_type.lower()
|
||||||
|
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||||
|
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||||
|
num_labels=num_labels,
|
||||||
|
finetuning_task=args.task_name,
|
||||||
|
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||||
|
do_lower_case=args.do_lower_case,
|
||||||
|
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||||
|
model = model_class.from_pretrained(args.model_name_or_path,
|
||||||
|
from_tf=bool('.ckpt' in args.model_name_or_path),
|
||||||
|
config=config,
|
||||||
|
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||||
|
|
||||||
|
if args.local_rank == 0:
|
||||||
|
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||||
|
|
||||||
|
model.to(args.device)
|
||||||
|
|
||||||
|
logger.info("Training/evaluation parameters %s", args)
|
||||||
|
|
||||||
|
|
||||||
|
# Training
|
||||||
|
if args.do_train:
|
||||||
|
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
|
||||||
|
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||||
|
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||||
|
|
||||||
|
|
||||||
|
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
|
||||||
|
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||||
|
# Create output directory if needed
|
||||||
|
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||||
|
os.makedirs(args.output_dir)
|
||||||
|
|
||||||
|
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||||
|
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||||
|
# They can then be reloaded using `from_pretrained()`
|
||||||
|
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||||
|
model_to_save.save_pretrained(args.output_dir)
|
||||||
|
tokenizer.save_pretrained(args.output_dir)
|
||||||
|
|
||||||
|
# Good practice: save your training arguments together with the trained model
|
||||||
|
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||||
|
|
||||||
|
# Load a trained model and vocabulary that you have fine-tuned
|
||||||
|
model = model_class.from_pretrained(args.output_dir)
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||||
|
model.to(args.device)
|
||||||
|
|
||||||
|
|
||||||
|
# Evaluation
|
||||||
|
results = {}
|
||||||
|
if args.do_eval and args.local_rank in [-1, 0]:
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||||
|
checkpoints = [args.output_dir]
|
||||||
|
if args.eval_all_checkpoints:
|
||||||
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
|
for checkpoint in checkpoints:
|
||||||
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
|
prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
|
||||||
|
|
||||||
|
model = model_class.from_pretrained(checkpoint)
|
||||||
|
model.to(args.device)
|
||||||
|
result = evaluate(args, model, tokenizer, prefix=prefix)
|
||||||
|
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
|
||||||
|
results.update(result)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: main() is called only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
13
setup.py
13
setup.py
@@ -36,9 +36,15 @@ To create the package for pypi.
|
|||||||
from io import open
|
from io import open
|
||||||
from setuptools import find_packages, setup
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
|
|
||||||
|
# Optional dependency groups, installable as `pip install transformers[<name>]`.
extras = {
    'serving': ['uvicorn', 'fastapi']
}
# 'all' must be a flat list of requirement strings. Collecting `extras.values()`
# directly would produce a list of lists, which is not a valid `extras_require` value.
extras['all'] = [package for packages in extras.values() for package in packages]
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="transformers",
|
name="transformers",
|
||||||
version="2.1.1",
|
version="2.2.1",
|
||||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
||||||
author_email="thomas@huggingface.co",
|
author_email="thomas@huggingface.co",
|
||||||
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
||||||
@@ -61,8 +67,11 @@ setup(
|
|||||||
"transformers=transformers.__main__:main",
|
"transformers=transformers.__main__:main",
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
extras_require=extras,
|
||||||
|
scripts=[
|
||||||
|
'transformers-cli'
|
||||||
|
],
|
||||||
# python_requires='>=3.5.0',
|
# python_requires='>=3.5.0',
|
||||||
tests_require=['pytest'],
|
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Intended Audience :: Science/Research',
|
'Intended Audience :: Science/Research',
|
||||||
'License :: OSI Approved :: Apache Software License',
|
'License :: OSI Approved :: Apache Software License',
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ import numpy as np
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
|
||||||
from .configuration_xxx import XxxConfig
|
from .configuration_xxx import XxxConfig
|
||||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
|
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||||
from .file_utils import add_start_docstrings
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -121,9 +121,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
|
|||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = tf.fill(tf.shape(input_ids), 1)
|
attention_mask = tf.fill(shape_list(input_ids), 1)
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
token_type_ids = tf.fill(tf.shape(input_ids), 0)
|
token_type_ids = tf.fill(shape_list(input_ids), 0)
|
||||||
|
|
||||||
# We create a 3D attention mask from a 2D tensor mask.
|
# We create a 3D attention mask from a 2D tensor mask.
|
||||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import XxxConfig, is_tf_available
|
from transformers import XxxConfig, is_tf_available
|
||||||
|
|
||||||
@@ -33,10 +33,9 @@ if is_tf_available():
|
|||||||
TFXxxForTokenClassification,
|
TFXxxForTokenClassification,
|
||||||
TFXxxForQuestionAnswering,
|
TFXxxForQuestionAnswering,
|
||||||
TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
|
all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
|
||||||
@@ -244,7 +243,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in ['xxx-base-uncased']:
|
for model_name in ['xxx-base-uncased']:
|
||||||
|
|||||||
@@ -18,12 +18,12 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
|
from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
|
||||||
@@ -31,10 +31,9 @@ if is_torch_available():
|
|||||||
XxxForQuestionAnswering, XxxForSequenceClassification,
|
XxxForQuestionAnswering, XxxForSequenceClassification,
|
||||||
XxxForTokenClassification, XxxForMultipleChoice)
|
XxxForTokenClassification, XxxForMultipleChoice)
|
||||||
from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
|
from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class XxxModelTest(CommonTestCases.CommonModelTester):
|
class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
|
all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
|
||||||
@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = XxxModel(config=config)
|
model = XxxModel(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||||
@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = XxxForMaskedLM(config=config)
|
model = XxxForMaskedLM(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = XxxForQuestionAnswering(config=config)
|
model = XxxForQuestionAnswering(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
start_positions=sequence_labels, end_positions=sequence_labels)
|
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = XxxForSequenceClassification(config)
|
model = XxxForSequenceClassification(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = XxxForTokenClassification(config=config)
|
model = XxxForTokenClassification(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
23
transformers-cli
Normal file
23
transformers-cli
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
from transformers.commands.user import UserCommands
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Build the top-level parser; each command family registers its own sub-commands on it.
    cli = ArgumentParser(description='Transformers CLI tool', usage='transformers-cli <command> [<args>]')
    subcommands = cli.add_subparsers(help='transformers-cli command helpers')
    UserCommands.register_subcommand(subcommands)

    parsed = cli.parse_args()

    if hasattr(parsed, 'func'):
        # `func` is the factory installed by the selected sub-command; it builds the
        # command object, which is then executed.
        parsed.func(parsed).run()
    else:
        # No sub-command was given: show usage and signal failure.
        cli.print_help()
        exit(1)
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
__version__ = "2.1.1"
|
__version__ = "2.2.1"
|
||||||
|
|
||||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||||
# default Python logging output behavior when present.
|
# default Python logging output behavior when present.
|
||||||
@@ -26,11 +26,12 @@ from .data import (is_sklearn_available,
|
|||||||
InputExample, InputFeatures, DataProcessor,
|
InputExample, InputFeatures, DataProcessor,
|
||||||
glue_output_modes, glue_convert_examples_to_features,
|
glue_output_modes, glue_convert_examples_to_features,
|
||||||
glue_processors, glue_tasks_num_labels,
|
glue_processors, glue_tasks_num_labels,
|
||||||
|
xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
|
||||||
squad_convert_examples_to_features, SquadFeatures,
|
squad_convert_examples_to_features, SquadFeatures,
|
||||||
SquadExample, SquadV1Processor, SquadV2Processor)
|
SquadExample, SquadV1Processor, SquadV2Processor)
|
||||||
|
|
||||||
if is_sklearn_available():
|
if is_sklearn_available():
|
||||||
from .data import glue_compute_metrics
|
from .data import glue_compute_metrics, xnli_compute_metrics
|
||||||
|
|
||||||
# Tokenizers
|
# Tokenizers
|
||||||
from .tokenization_utils import (PreTrainedTokenizer)
|
from .tokenization_utils import (PreTrainedTokenizer)
|
||||||
@@ -44,6 +45,7 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
|||||||
from .tokenization_xlm import XLMTokenizer
|
from .tokenization_xlm import XLMTokenizer
|
||||||
from .tokenization_roberta import RobertaTokenizer
|
from .tokenization_roberta import RobertaTokenizer
|
||||||
from .tokenization_distilbert import DistilBertTokenizer
|
from .tokenization_distilbert import DistilBertTokenizer
|
||||||
|
from .tokenization_albert import AlbertTokenizer
|
||||||
from .tokenization_camembert import CamembertTokenizer
|
from .tokenization_camembert import CamembertTokenizer
|
||||||
|
|
||||||
# Configurations
|
# Configurations
|
||||||
@@ -59,6 +61,7 @@ from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|||||||
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
# Modeling
|
# Modeling
|
||||||
@@ -85,9 +88,10 @@ if is_torch_available():
|
|||||||
CTRLLMHeadModel,
|
CTRLLMHeadModel,
|
||||||
CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
||||||
XLNetForSequenceClassification, XLNetForMultipleChoice,
|
XLNetForSequenceClassification, XLNetForTokenClassification,
|
||||||
XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
|
XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
|
||||||
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
XLNetForQuestionAnswering, load_tf_weights_in_xlnet,
|
||||||
|
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
||||||
XLMWithLMHeadModel, XLMForSequenceClassification,
|
XLMWithLMHeadModel, XLMForSequenceClassification,
|
||||||
XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
|
XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
|
||||||
@@ -96,7 +100,7 @@ if is_torch_available():
|
|||||||
RobertaForSequenceClassification, RobertaForMultipleChoice,
|
RobertaForSequenceClassification, RobertaForMultipleChoice,
|
||||||
RobertaForTokenClassification,
|
RobertaForTokenClassification,
|
||||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
|
||||||
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
||||||
DistilBertForTokenClassification,
|
DistilBertForTokenClassification,
|
||||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
@@ -106,6 +110,10 @@ if is_torch_available():
|
|||||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
||||||
|
|
||||||
|
from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
|
||||||
|
AlbertForQuestionAnswering,
|
||||||
|
load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
# Optimization
|
# Optimization
|
||||||
from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
|
from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
|
||||||
get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
|
get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
|
||||||
@@ -113,7 +121,7 @@ if is_torch_available():
|
|||||||
|
|
||||||
# TensorFlow
|
# TensorFlow
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
|
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
|
||||||
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
||||||
TFAutoModelWithLMHead)
|
TFAutoModelWithLMHead)
|
||||||
|
|
||||||
@@ -139,6 +147,7 @@ if is_tf_available():
|
|||||||
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
||||||
TFXLNetModel, TFXLNetLMHeadModel,
|
TFXLNetModel, TFXLNetLMHeadModel,
|
||||||
TFXLNetForSequenceClassification,
|
TFXLNetForSequenceClassification,
|
||||||
|
TFXLNetForTokenClassification,
|
||||||
TFXLNetForQuestionAnsweringSimple,
|
TFXLNetForQuestionAnsweringSimple,
|
||||||
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
@@ -157,6 +166,7 @@ if is_tf_available():
|
|||||||
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
|
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
|
||||||
TFDistilBertModel, TFDistilBertForMaskedLM,
|
TFDistilBertModel, TFDistilBertForMaskedLM,
|
||||||
TFDistilBertForSequenceClassification,
|
TFDistilBertForSequenceClassification,
|
||||||
|
TFDistilBertForTokenClassification,
|
||||||
TFDistilBertForQuestionAnswering,
|
TFDistilBertForQuestionAnswering,
|
||||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
@@ -164,6 +174,12 @@ if is_tf_available():
|
|||||||
TFCTRLLMHeadModel,
|
TFCTRLLMHeadModel,
|
||||||
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
|
||||||
|
TFAlbertForSequenceClassification,
|
||||||
|
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
# Optimization
|
||||||
|
from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
|
||||||
|
|
||||||
# TF 2.0 <=> PyTorch conversion utilities
|
# TF 2.0 <=> PyTorch conversion utilities
|
||||||
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
|
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
|
||||||
load_pytorch_checkpoint_in_tf2_model,
|
load_pytorch_checkpoint_in_tf2_model,
|
||||||
|
|||||||
12
transformers/commands/__init__.py
Normal file
12
transformers/commands/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
class BaseTransformersCLICommand(ABC):
    """Abstract interface for a family of `transformers-cli` commands.

    Concrete subclasses must provide both `register_subcommand` and `run`.
    """

    @staticmethod
    @abstractmethod
    def register_subcommand(parser: ArgumentParser):
        """Attach this command family's sub-commands to `parser`."""
        raise NotImplementedError()

    @abstractmethod
    def run(self):
        """Execute the command."""
        raise NotImplementedError()
|
||||||
165
transformers/commands/user.py
Normal file
165
transformers/commands/user.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
from argparse import ArgumentParser
|
||||||
|
from getpass import getpass
|
||||||
|
import os
|
||||||
|
|
||||||
|
from transformers.commands import BaseTransformersCLICommand
|
||||||
|
from transformers.hf_api import HfApi, HfFolder, HTTPError
|
||||||
|
|
||||||
|
|
||||||
|
class UserCommands(BaseTransformersCLICommand):
    """Registers the user-account sub-commands: login, whoami, logout, ls and upload."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Add one sub-parser per user command to `parser`.

        Each sub-parser stores a `func` default: a factory that receives the parsed
        arguments and returns the command object to run.
        """
        # Commands that take no arguments of their own. The lambdas defer the class
        # lookup until the factory is actually called.
        argless_commands = (
            ('login', lambda args: LoginCommand(args)),
            ('whoami', lambda args: WhoamiCommand(args)),
            ('logout', lambda args: LogoutCommand(args)),
            ('ls', lambda args: ListObjsCommand(args)),
        )
        for command_name, factory in argless_commands:
            parser.add_parser(command_name).set_defaults(func=factory)

        # upload
        upload = parser.add_parser('upload')
        upload.add_argument('file', type=str, help='Local filepath of the file to upload.')
        upload.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
        upload.set_defaults(func=lambda args: UploadCommand(args))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ANSI:
    """
    Helper for en.wikipedia.org/wiki/ANSI_escape_code
    """
    _bold = u"\u001b[1m"
    _reset = u"\u001b[0m"

    @classmethod
    def bold(cls, s):
        """Return `s` wrapped between the bold and reset escape sequences."""
        template = "{start}{text}{end}"
        return template.format(start=cls._bold, text=s, end=cls._reset)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseUserCommand:
    """Shared base for the user commands: keeps the parsed arguments and an API client."""

    def __init__(self, args):
        """Store `args` and create the `HfApi` client used by subclasses."""
        self._api = HfApi()
        self.args = args
|
||||||
|
|
||||||
|
|
||||||
|
class LoginCommand(BaseUserCommand):
# `login` command: prompts for credentials, exchanges them for a token through the API client, and stores the token with HfFolder.
|
||||||
|
def run(self):
# Prints the banner, reads the username and (hidden) password, then logs in. On HTTPError the error is printed and the process exits with status 1.
|
||||||
|
print("""
|
||||||
|
_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
|
||||||
|
_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||||
|
_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
|
||||||
|
_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||||
|
_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
|
||||||
|
|
||||||
|
""")
# NOTE(review): the banner's column alignment looks collapsed in this copy — confirm against the original file before relying on it.
|
||||||
|
username = input("Username: ")
|
||||||
|
# getpass() reads the password without echoing it to the terminal.
password = getpass()
|
||||||
|
try:
|
||||||
|
token = self._api.login(username, password)
|
||||||
|
except HTTPError as e:
|
||||||
|
# probably invalid credentials, display error message.
|
||||||
|
print(e)
|
||||||
|
exit(1)
|
||||||
|
# Reached only after a successful login: persist the token so other commands can read it back.
HfFolder.save_token(token)
|
||||||
|
print("Login successful")
|
||||||
|
# NOTE(review): the token is echoed to stdout in clear text — confirm this is intended.
print("Your token:", token, "\n")
|
||||||
|
print("Your token has been saved to", HfFolder.path_token)
|
||||||
|
|
||||||
|
|
||||||
|
class WhoamiCommand(BaseUserCommand):
    """`whoami` command: prints the user associated with the locally saved token."""

    def run(self):
        """Look up the saved token and print what the API reports for it.

        Exits with the default (success) status after printing "Not logged in" when
        no token is saved. Exits with status 1 when the API call raises `HTTPError`.
        """
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit()
        # Keep the try body limited to the call that can raise HTTPError.
        try:
            user = self._api.whoami(token)
        except HTTPError as e:
            print(e)
            # Report the failure through the exit status, as the login and ls
            # commands do, so a failed lookup is not mistaken for success.
            exit(1)
        print(user)
|
||||||
|
|
||||||
|
|
||||||
|
class LogoutCommand(BaseUserCommand):
    """CLI command that removes the stored token and logs out through the API client."""

    def run(self):
        """Delete the saved token, then call ``logout`` with it.

        Prints "Not logged in" and exits when no token is stored.
        """
        saved_token = HfFolder.get_token()
        if saved_token is None:
            print("Not logged in")
            exit()
        # The local copy is removed first; the API call still receives the token read above.
        HfFolder.delete_token()
        self._api.logout(saved_token)
        print("Successfully logged out.")
|
||||||
|
|
||||||
|
|
||||||
|
class ListObjsCommand(BaseUserCommand):
    """CLI command that prints a table of the objects returned by ``list_objs``."""

    def tabulate(self, rows, headers):
        # type: (List[List[Union[str, int]]], List[str]) -> str
        """
        Render ``rows`` under ``headers`` as a plain-text table.

        Each column is as wide as its longest cell (header included), and a
        row of dashes separates the header from the data.

        Inspired by:
        stackoverflow.com/a/8356620/593036
        stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
        """
        columns = zip(*rows, headers)
        col_widths = [max(len(str(cell)) for cell in column) for column in columns]
        # One "{:<width>} " field per header, filled with the computed widths.
        row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
        dashes = ["-" * width for width in col_widths]
        lines = [row_format.format(*headers), row_format.format(*dashes)]
        lines.extend(row_format.format(*row) for row in rows)
        return "\n".join(lines)

    def run(self):
        """Fetch the objects for the saved token and print them as a table.

        Exits with status 1 when no token is stored or the API call raises
        ``HTTPError``; prints "No shared file yet" and exits when the listing
        is empty.
        """
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)
        try:
            objs = self._api.list_objs(token)
        except HTTPError as e:
            print(e)
            exit(1)
        if len(objs) == 0:
            print("No shared file yet")
            exit()
        rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs]
        table = self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])
        print(table)
|
||||||
|
|
||||||
|
|
||||||
|
class UploadCommand(BaseUserCommand):
    """CLI command that uploads one local file after asking for confirmation."""

    def run(self):
        """Confirm with the user, upload ``self.args.file`` and print the returned URL.

        Exits with status 1 when no token is stored. The remote filename is
        ``self.args.filename`` when given, otherwise the basename of the
        local path.
        """
        token = HfFolder.get_token()
        if token is None:
            print("Not logged in")
            exit(1)

        filepath = os.path.join(os.getcwd(), self.args.file)
        filename = self.args.filename
        if filename is None:
            filename = os.path.basename(filepath)

        announcement = "About to upload file {} to S3 under filename {}".format(
            ANSI.bold(filepath), ANSI.bold(filename)
        )
        print(announcement)

        # An empty answer counts as "yes"; anything but "", "y" or "yes" aborts.
        choice = input("Proceed? [Y/n] ").lower()
        if choice != "" and choice != "y" and choice != "yes":
            print("Abort")
            exit()

        print(ANSI.bold("Uploading... This might take a while if file is large"))
        access_url = self._api.presign_and_upload(token=token, filename=filename, filepath=filepath)
        print("Your file now lives at:")
        print(access_url)
|
||||||
100
transformers/configuration_albert.py
Normal file
100
transformers/configuration_albert.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" ALBERT model configuration """
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
# Shortcut name -> URL of the hosted configuration file.
# Keys are 'albert-<size>-<version>'; v1 files are named 'albert-<size>-config.json',
# v2 files 'albert-<size>-v2-config.json'. All v1 entries come first, then v2.
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'albert-{}-{}'.format(size, version):
        "https://s3.amazonaws.com/models.huggingface.co/bert/albert-{}{}-config.json".format(size, suffix)
    for version, suffix in (('v1', ''), ('v2', '-v2'))
    for size in ('base', 'large', 'xlarge', 'xxlarge')
}
|
||||||
|
|
||||||
|
class AlbertConfig(PretrainedConfig):
    """Configuration for `AlbertModel`.

    The default settings match the configuration of model `albert_xxlarge`.
    """

    pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(
        self,
        vocab_size_or_config_json_file=30000,
        embedding_size=128,
        hidden_size=4096,
        num_hidden_layers=12,
        num_hidden_groups=1,
        num_attention_heads=64,
        intermediate_size=16384,
        inner_group_num=1,
        hidden_act="gelu_new",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        **kwargs
    ):
        """Constructs AlbertConfig.

        Every argument is stored on the instance under its own name, except
        `vocab_size_or_config_json_file`, which is stored as `vocab_size`.
        Remaining keyword arguments are forwarded to `PretrainedConfig`.

        Args:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in
                `AlbertModel`.
            embedding_size: Size of voc embeddings.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer
                encoder.
            num_hidden_groups: Number of group for the hidden layers,
                parameters in the same group are shared.
            num_attention_heads: Number of attention heads for each attention
                layer in the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e.,
                feed-forward) layer in the Transformer encoder.
            inner_group_num: int, number of inner repetition of attention and
                ffn.
            hidden_act: The non-linear activation function (function or
                string) in the encoder and pooler.
            hidden_dropout_prob: The dropout probability for all fully
                connected layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this
                model might ever be used with. Typically set this to something
                large just in case (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids`
                passed into `AlbertModel`.
            initializer_range: The stdev of the truncated_normal_initializer
                for initializing all weight matrices.
            layer_norm_eps: Stored as `layer_norm_eps`; presumably the epsilon
                of the layer-normalization layers (confirm against the model
                code).
        """
        super(AlbertConfig, self).__init__(**kwargs)

        # Vocabulary and embedding sizes.
        self.vocab_size = vocab_size_or_config_json_file
        self.embedding_size = embedding_size

        # Encoder layout.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_hidden_groups = num_hidden_groups
        self.num_attention_heads = num_attention_heads
        self.inner_group_num = inner_group_num
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size

        # Dropout settings.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        # Input limits, initialization and normalization.
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
|
||||||
@@ -28,6 +28,7 @@ from .configuration_roberta import RobertaConfig
|
|||||||
from .configuration_distilbert import DistilBertConfig
|
from .configuration_distilbert import DistilBertConfig
|
||||||
from .configuration_ctrl import CTRLConfig
|
from .configuration_ctrl import CTRLConfig
|
||||||
from .configuration_camembert import CamembertConfig
|
from .configuration_camembert import CamembertConfig
|
||||||
|
from .configuration_albert import AlbertConfig
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -44,14 +45,15 @@ class AutoConfig(object):
|
|||||||
The base model class to instantiate is selected as the first pattern matching
|
The base model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertConfig (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||||
|
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||||
- contains `bert`: BertConfig (Bert model)
|
- contains `bert`: BertConfig (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||||
- contains `xlm`: XLMConfig (XLM model)
|
- contains `xlm`: XLMConfig (XLM model)
|
||||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
|
||||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
|
||||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
- contains `ctrl` : CTRLConfig (CTRL model)
|
||||||
This class cannot be instantiated using `__init__()` (throw an error).
|
This class cannot be instantiated using `__init__()` (throw an error).
|
||||||
"""
|
"""
|
||||||
@@ -67,14 +69,15 @@ class AutoConfig(object):
|
|||||||
The configuration class to instantiate is selected as the first pattern matching
|
The configuration class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertConfig (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||||
|
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||||
- contains `bert`: BertConfig (Bert model)
|
- contains `bert`: BertConfig (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||||
- contains `xlm`: XLMConfig (XLM model)
|
- contains `xlm`: XLMConfig (XLM model)
|
||||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
|
||||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
|
||||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
- contains `ctrl` : CTRLConfig (CTRL model)
|
||||||
Params:
|
Params:
|
||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
@@ -95,6 +98,9 @@ class AutoConfig(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -119,6 +125,8 @@ class AutoConfig(object):
|
|||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
elif 'camembert' in pretrained_model_name_or_path:
|
elif 'camembert' in pretrained_model_name_or_path:
|
||||||
return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
@@ -139,4 +147,4 @@ class AutoConfig(object):
|
|||||||
return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||||
"'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path))
|
"'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|||||||
@@ -27,7 +27,9 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
||||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
|
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
|
||||||
|
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
|
||||||
|
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -94,6 +94,9 @@ class PretrainedConfig(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -120,6 +123,7 @@ class PretrainedConfig(object):
|
|||||||
"""
|
"""
|
||||||
cache_dir = kwargs.pop('cache_dir', None)
|
cache_dir = kwargs.pop('cache_dir', None)
|
||||||
force_download = kwargs.pop('force_download', False)
|
force_download = kwargs.pop('force_download', False)
|
||||||
|
resume_download = kwargs.pop('resume_download', False)
|
||||||
proxies = kwargs.pop('proxies', None)
|
proxies = kwargs.pop('proxies', None)
|
||||||
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
|
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
|
||||||
|
|
||||||
@@ -131,7 +135,8 @@ class PretrainedConfig(object):
|
|||||||
config_file = pretrained_model_name_or_path
|
config_file = pretrained_model_name_or_path
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
|
||||||
|
proxies=proxies, resume_download=resume_download)
|
||||||
except EnvironmentError:
|
except EnvironmentError:
|
||||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
||||||
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
||||||
|
|||||||
@@ -0,0 +1,67 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Convert ALBERT checkpoint."""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
|
||||||
|
|
||||||
|
import logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    """Build an ALBERT masked-LM model, load TensorFlow weights into it and save its state dict.

    Args:
        tf_checkpoint_path: TensorFlow checkpoint passed to `load_tf_weights_in_albert`.
        albert_config_file: JSON file read with `AlbertConfig.from_json_file`.
        pytorch_dump_path: Destination passed to `torch.save`.
    """
    # Create the PyTorch model from the JSON configuration.
    albert_config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(albert_config)))
    pt_model = AlbertForMaskedLM(albert_config)

    # Copy the TensorFlow checkpoint weights into the model.
    load_tf_weights_in_albert(pt_model, albert_config, tf_checkpoint_path)

    # Write the resulting weights to disk.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    state = pt_model.state_dict()
    torch.save(state, pytorch_dump_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    ## Required parameters
    # All three options are required strings, so the common keywords are spelled once.
    required_str = {"default": None, "type": str, "required": True}
    parser.add_argument(
        "--tf_checkpoint_path",
        help="Path to the TensorFlow checkpoint path.",
        **required_str
    )
    parser.add_argument(
        "--albert_config_file",
        help="The config json file corresponding to the pre-trained ALBERT model. \n"
             "This specifies the model architecture.",
        **required_str
    )
    parser.add_argument(
        "--pytorch_dump_path",
        help="Path to the output PyTorch model.",
        **required_str
    )

    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        args.tf_checkpoint_path,
        args.albert_config_file,
        args.pytorch_dump_path,
    )
|
||||||
|
|
||||||
@@ -33,7 +33,8 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
|
|||||||
OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
@@ -46,7 +47,8 @@ if is_torch_available():
|
|||||||
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
else:
|
||||||
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
@@ -56,7 +58,8 @@ else:
|
|||||||
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) = (
|
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) = (
|
||||||
None, None, None, None,
|
None, None, None, None,
|
||||||
None, None,
|
None, None,
|
||||||
None, None,
|
None, None,
|
||||||
@@ -65,6 +68,7 @@ else:
|
|||||||
None, None,
|
None, None,
|
||||||
None, None, None,
|
None, None, None,
|
||||||
None, None, None,
|
None, None, None,
|
||||||
|
None, None,
|
||||||
None, None)
|
None, None)
|
||||||
|
|
||||||
|
|
||||||
@@ -85,7 +89,8 @@ MODEL_CLASSES = {
|
|||||||
'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||||
}
|
}
|
||||||
|
|
||||||
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
|
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
|
from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
|
||||||
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||||
from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
|
from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
|
||||||
|
from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||||
|
|
||||||
from .metrics import is_sklearn_available
|
from .metrics import is_sklearn_available
|
||||||
if is_sklearn_available():
|
if is_sklearn_available():
|
||||||
from .metrics import glue_compute_metrics
|
from .metrics import glue_compute_metrics, xnli_compute_metrics
|
||||||
|
|||||||
@@ -81,3 +81,11 @@ if _has_sklearn:
|
|||||||
return {"acc": simple_accuracy(preds, labels)}
|
return {"acc": simple_accuracy(preds, labels)}
|
||||||
else:
|
else:
|
||||||
raise KeyError(task_name)
|
raise KeyError(task_name)
|
||||||
|
|
||||||
|
|
||||||
|
def xnli_compute_metrics(task_name, preds, labels):
    """Return ``{"acc": simple_accuracy(preds, labels)}`` for the "xnli" task.

    Asserts that ``preds`` and ``labels`` have the same length and raises
    ``KeyError(task_name)`` for any other task name.
    """
    assert len(preds) == len(labels)
    # Only one task is known here; reject everything else up front.
    if task_name != "xnli":
        raise KeyError(task_name)
    accuracy = simple_accuracy(preds, labels)
    return {"acc": accuracy}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from .utils import InputExample, InputFeatures, DataProcessor
|
from .utils import InputExample, InputFeatures, DataProcessor
|
||||||
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||||
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
|
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
|
||||||
|
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||||
85
transformers/data/processors/xnli.py
Normal file
85
transformers/data/processors/xnli.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" XNLI utils (dataset loading and evaluation) """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from .utils import DataProcessor, InputExample
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class XnliProcessor(DataProcessor):
    """Processor for the XNLI dataset.

    Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
    """

    def __init__(self, language, train_language=None):
        """Store the evaluation language and, optionally, a different training language."""
        self.language = language
        self.train_language = train_language

    def get_train_examples(self, data_dir):
        """See base class."""
        # Train on `train_language` when one was given, otherwise on `language`.
        lg = self.train_language if self.train_language is not None else self.language
        train_file = os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg))
        examples = []
        for row_idx, row in enumerate(self._read_tsv(train_file)):
            if row_idx == 0:
                # The first row is skipped.
                continue
            guid = "%s-%s" % ('train', row_idx)
            text_a = row[0]
            text_b = row[1]
            label = row[2]
            # The training file spells this class "contradictory"; normalize it to the label set.
            if label == "contradictory":
                label = "contradiction"
            assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
            example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            examples.append(example)
        return examples

    def get_test_examples(self, data_dir):
        """See base class."""
        test_file = os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")
        examples = []
        for row_idx, row in enumerate(self._read_tsv(test_file)):
            if row_idx == 0:
                # The first row is skipped.
                continue
            # Column 0 holds the language; keep only rows for `self.language`.
            if row[0] != self.language:
                continue
            guid = "%s-%s" % ('test', row_idx)
            text_a = row[6]
            text_b = row[7]
            label = row[1]
            assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
            example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            examples.append(example)
        return examples

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]
|
||||||
|
|
||||||
|
# Task name -> processor class.
xnli_processors = {"xnli": XnliProcessor}
|
||||||
|
|
||||||
|
# Task name -> output mode string.
xnli_output_modes = {"xnli": "classification"}

# Task name -> number of labels.
xnli_tasks_num_labels = {"xnli": 3}
|
||||||
@@ -22,6 +22,7 @@ from botocore.config import Config
|
|||||||
from botocore.exceptions import ClientError
|
from botocore.exceptions import ClientError
|
||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
@@ -152,7 +153,7 @@ def filename_to_url(filename, cache_dir=None):
|
|||||||
return url, etag
|
return url, etag
|
||||||
|
|
||||||
|
|
||||||
def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
|
def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False):
|
||||||
"""
|
"""
|
||||||
Given something that might be a URL (or might be a local path),
|
Given something that might be a URL (or might be a local path),
|
||||||
determine which. If it's a URL, download the file and cache it, and
|
determine which. If it's a URL, download the file and cache it, and
|
||||||
@@ -161,6 +162,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
|||||||
Args:
|
Args:
|
||||||
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
||||||
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
||||||
|
resume_download: if True, resume the download if an incompletely received file is found.
|
||||||
"""
|
"""
|
||||||
if cache_dir is None:
|
if cache_dir is None:
|
||||||
cache_dir = TRANSFORMERS_CACHE
|
cache_dir = TRANSFORMERS_CACHE
|
||||||
@@ -173,7 +175,9 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
|||||||
|
|
||||||
if parsed.scheme in ('http', 'https', 's3'):
|
if parsed.scheme in ('http', 'https', 's3'):
|
||||||
# URL, so get it from the cache (downloading if necessary)
|
# URL, so get it from the cache (downloading if necessary)
|
||||||
return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
return get_from_cache(url_or_filename, cache_dir=cache_dir,
|
||||||
|
force_download=force_download, proxies=proxies,
|
||||||
|
resume_download=resume_download)
|
||||||
elif os.path.exists(url_or_filename):
|
elif os.path.exists(url_or_filename):
|
||||||
# File, and it exists.
|
# File, and it exists.
|
||||||
return url_or_filename
|
return url_or_filename
|
||||||
@@ -234,19 +238,22 @@ def s3_get(url, temp_file, proxies=None):
|
|||||||
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
||||||
|
|
||||||
|
|
||||||
def http_get(url, temp_file, proxies=None):
|
def http_get(url, temp_file, proxies=None, resume_size=0):
|
||||||
req = requests.get(url, stream=True, proxies=proxies)
|
headers={'Range':'bytes=%d-'%(resume_size,)} if resume_size > 0 else None
|
||||||
content_length = req.headers.get('Content-Length')
|
response = requests.get(url, stream=True, proxies=proxies, headers=headers)
|
||||||
total = int(content_length) if content_length is not None else None
|
if response.status_code == 416: # Range not satisfiable
|
||||||
progress = tqdm(unit="B", total=total)
|
return
|
||||||
for chunk in req.iter_content(chunk_size=1024):
|
content_length = response.headers.get('Content-Length')
|
||||||
|
total = resume_size + int(content_length) if content_length is not None else None
|
||||||
|
progress = tqdm(unit="B", total=total, initial=resume_size)
|
||||||
|
for chunk in response.iter_content(chunk_size=1024):
|
||||||
if chunk: # filter out keep-alive new chunks
|
if chunk: # filter out keep-alive new chunks
|
||||||
progress.update(len(chunk))
|
progress.update(len(chunk))
|
||||||
temp_file.write(chunk)
|
temp_file.write(chunk)
|
||||||
progress.close()
|
progress.close()
|
||||||
|
|
||||||
|
|
||||||
def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10):
|
def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False):
|
||||||
"""
|
"""
|
||||||
Given a URL, look for the corresponding dataset in the local cache.
|
Given a URL, look for the corresponding dataset in the local cache.
|
||||||
If it's not there, download it. Then return the path to the cached file.
|
If it's not there, download it. Then return the path to the cached file.
|
||||||
@@ -289,17 +296,35 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
|
|||||||
if matching_files:
|
if matching_files:
|
||||||
cache_path = os.path.join(cache_dir, matching_files[-1])
|
cache_path = os.path.join(cache_dir, matching_files[-1])
|
||||||
|
|
||||||
|
if resume_download:
|
||||||
|
incomplete_path = cache_path + '.incomplete'
|
||||||
|
@contextmanager
|
||||||
|
def _resumable_file_manager():
|
||||||
|
with open(incomplete_path,'a+b') as f:
|
||||||
|
yield f
|
||||||
|
os.remove(incomplete_path)
|
||||||
|
temp_file_manager = _resumable_file_manager
|
||||||
|
if os.path.exists(incomplete_path):
|
||||||
|
resume_size = os.stat(incomplete_path).st_size
|
||||||
|
else:
|
||||||
|
resume_size = 0
|
||||||
|
else:
|
||||||
|
temp_file_manager = tempfile.NamedTemporaryFile
|
||||||
|
resume_size = 0
|
||||||
|
|
||||||
if not os.path.exists(cache_path) or force_download:
|
if not os.path.exists(cache_path) or force_download:
|
||||||
# Download to temporary file, then copy to cache dir once finished.
|
# Download to temporary file, then copy to cache dir once finished.
|
||||||
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
||||||
with tempfile.NamedTemporaryFile() as temp_file:
|
with temp_file_manager() as temp_file:
|
||||||
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
|
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
|
||||||
|
|
||||||
# GET file object
|
# GET file object
|
||||||
if url.startswith("s3://"):
|
if url.startswith("s3://"):
|
||||||
|
if resume_download:
|
||||||
|
logger.warn('Warning: resumable downloads are not implemented for "s3://" urls')
|
||||||
s3_get(url, temp_file, proxies=proxies)
|
s3_get(url, temp_file, proxies=proxies)
|
||||||
else:
|
else:
|
||||||
http_get(url, temp_file, proxies=proxies)
|
http_get(url, temp_file, proxies=proxies, resume_size=resume_size)
|
||||||
|
|
||||||
# we are copying the file before closing it, so flush to avoid truncation
|
# we are copying the file before closing it, so flush to avoid truncation
|
||||||
temp_file.flush()
|
temp_file.flush()
|
||||||
|
|||||||
228
transformers/hf_api.py
Normal file
228
transformers/hf_api.py
Normal file
@@ -0,0 +1,228 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
from os.path import expanduser
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import six
|
||||||
|
from requests.exceptions import HTTPError
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Default API base URL; HfApi falls back to it when no endpoint is passed.
ENDPOINT = "https://huggingface.co"
|
||||||
|
|
||||||
|
class S3Obj:
    """
    Plain record describing one stored file (see `HfApi.list_objs`).
    """
    def __init__(
        self,
        filename,      # type: str
        LastModified,  # type: str
        ETag,          # type: str
        Size,          # type: int
        **kwargs
    ):
        # Any additional keyword arguments are accepted and ignored.
        self.filename, self.LastModified = filename, LastModified
        self.ETag, self.Size = ETag, Size
|
||||||
|
|
||||||
|
|
||||||
|
class PresignedUrl:
    """
    Plain record holding the result of `HfApi.presign`.
    """
    def __init__(
        self,
        write,   # type: str
        access,  # type: str
        type,    # type: str
        **kwargs
    ):
        # Any additional keyword arguments are accepted and ignored.
        self.write, self.access = write, access
        # mime-type to send to S3.
        self.type = type
|
||||||
|
|
||||||
|
|
||||||
|
class HfApi:
    """
    Thin client for the HF HTTP API: every method issues one request against
    `self.endpoint` and raises `requests.exceptions.HTTPError` on a non-2xx
    response (via `raise_for_status`).
    """
    def __init__(self, endpoint=None):
        self.endpoint = endpoint if endpoint is not None else ENDPOINT

    def login(
        self,
        username,  # type: str
        password,  # type: str
    ):
        # type: (...) -> str
        """
        Call HF API to sign in a user and get a token if credentials are valid.

        Outputs:
            token if credentials are valid

        Throws:
            requests.exceptions.HTTPError if credentials are invalid
        """
        path = "{}/api/login".format(self.endpoint)
        r = requests.post(path, json={"username": username, "password": password})
        r.raise_for_status()
        d = r.json()
        return d["token"]

    def whoami(
        self,
        token,  # type: str
    ):
        # type: (...) -> str
        """
        Call HF API to know "whoami"
        """
        path = "{}/api/whoami".format(self.endpoint)
        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()
        d = r.json()
        return d["user"]

    def logout(self, token):
        # type: (...) -> None
        """
        Call HF API to log out.
        """
        path = "{}/api/logout".format(self.endpoint)
        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()

    def presign(self, token, filename):
        # type: (...) -> PresignedUrl
        """
        Call HF API to get a presigned url to upload `filename` to S3.
        """
        path = "{}/api/presign".format(self.endpoint)
        r = requests.post(
            path,
            headers={"authorization": "Bearer {}".format(token)},
            json={"filename": filename},
        )
        r.raise_for_status()
        d = r.json()
        return PresignedUrl(**d)

    def presign_and_upload(self, token, filename, filepath):
        # type: (...) -> str
        """
        Get a presigned url, then upload file to S3.

        Outputs:
            url: Read-only url for the stored file on S3.

        Throws:
            requests.exceptions.HTTPError if presigning or the upload fails
        """
        urls = self.presign(token, filename=filename)
        # streaming upload:
        # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
        #
        # Even though we presign with the correct content-type,
        # the client still has to specify it when uploading the file.
        with open(filepath, "rb") as f:
            pf = TqdmProgressFileReader(f)
            try:
                r = requests.put(urls.write, data=f, headers={
                    "content-type": urls.type,
                })
                r.raise_for_status()
            finally:
                # Close the progress bar even when the upload raises, so a
                # failed upload does not leave a dangling bar behind.
                pf.close()
        return urls.access

    def list_objs(self, token):
        # type: (...) -> List[S3Obj]
        """
        Call HF API to list all stored files for user.
        """
        path = "{}/api/listObjs".format(self.endpoint)
        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()
        d = r.json()
        return [S3Obj(**x) for x in d]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TqdmProgressFileReader:
    """
    Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
    and override `f.read()` so as to display a tqdm progress bar.

    see github.com/huggingface/transformers/pull/2078#discussion_r354739608
    for implementation details.
    """
    def __init__(
        self,
        f   # type: io.BufferedReader
    ):
        self.f = f
        self.total_size = os.fstat(f.fileno()).st_size # type: int
        self.pbar = tqdm(total=self.total_size, leave=False)
        if six.PY3:
            # does not work unless PY3
            # no big deal as the CLI does not currently support PY2 anyways.
            # Keep the real `read` and route every `f.read()` through `_read`.
            self.read = f.read
            f.read = self._read

    def _read(self, n=-1):
        """
        Read through the original `f.read` and advance the bar by the number
        of bytes actually returned.

        Advancing by the requested `n` would be wrong: `n=-1` ("read all")
        would move the bar backwards, and a short read or EOF would over-count.
        """
        data = self.read(n)
        self.pbar.update(len(data))
        return data

    def close(self):
        """Close the progress bar (the wrapped file is left untouched)."""
        self.pbar.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class HfFolder:
    """
    Persist the API token in a file under the user's home directory.
    """
    path_token = expanduser("~/.huggingface/token")

    @classmethod
    def save_token(cls, token):
        """
        Save token, creating folder as needed.
        """
        # `errno` is imported locally; `os.errno` was never a documented API
        # and is gone in recent Python 3 releases.
        import errno

        # One code path for Python 2 and 3: create the folder and tolerate
        # only "already exists".
        try:
            os.makedirs(os.path.dirname(cls.path_token))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        with open(cls.path_token, 'w+') as f:
            f.write(token)

    @classmethod
    def get_token(cls):
        """
        Get token or None if not existent.
        """
        try:
            with open(cls.path_token, 'r') as f:
                return f.read()
        except (IOError, OSError):
            # Python 2 raises IOError here, Python 3 raises an OSError
            # subclass (FileNotFoundError). Unrelated exceptions propagate.
            return None

    @classmethod
    def delete_token(cls):
        """
        Delete token.
        Do not fail if token does not exist.
        """
        try:
            os.remove(cls.path_token)
        except OSError:
            return
|
||||||
801
transformers/modeling_albert.py
Normal file
801
transformers/modeling_albert.py
Normal file
@@ -0,0 +1,801 @@
|
|||||||
|
|
||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""PyTorch ALBERT model. """
|
||||||
|
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
import logging
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
from transformers.modeling_utils import PreTrainedModel
|
||||||
|
from transformers.configuration_albert import AlbertConfig
|
||||||
|
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention, prune_linear_layer, ACT2FN
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Shortcut name -> URL of the pretrained PyTorch weights.
# Note that the v1 file names carry no version suffix.
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    # v1
    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
    # v2
    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model.

    Every variable of the checkpoint at `tf_checkpoint_path` is read, its name
    is rewritten into a '/'-separated attribute path, the path is walked from
    `model`, and the tensor found there gets its `.data` replaced.

    Returns `model`. Raises ImportError when TensorFlow (or numpy) is missing
    and AssertionError when a checkpoint array and its target differ in shape.
    `config` is not used by this function.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        # Each variable name is logged here; no separate dump is needed.
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        original_name = name

        # If saved from the TF HUB module
        name = name.replace("module/", "")

        # Renaming and simplifying
        name = name.replace("ffn_1", "ffn")
        name = name.replace("bert/", "albert/")
        name = name.replace("attention_1", "attention")
        name = name.replace("transform/", "")
        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
        name = name.replace("LayerNorm", "attention/LayerNorm")
        name = name.replace("transformer/", "")

        # The feed forward layer had an 'intermediate' step which has been abstracted away
        name = name.replace("intermediate/dense/", "")
        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")

        # ALBERT attention was split between self and output which have been abstracted away
        name = name.replace("/output/", "/")
        name = name.replace("/self/", "/")

        # The pooler is a linear layer
        name = name.replace("pooler/dense", "pooler")

        # The classifier was simplified to predictions from cls/predictions
        name = name.replace("cls/predictions", "predictions")
        name = name.replace("predictions/attention", "predictions")

        # Naming was changed to be more explicit
        name = name.replace("embeddings/attention", "embeddings")
        name = name.replace("inner_group_", "albert_layers/")
        name = name.replace("group_", "albert_layer_groups/")

        # Classifier
        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
            name = "classifier/" + name

        # No ALBERT model currently handles the next sentence prediction task
        if "seq_relationship" in name:
            continue

        name = name.split('/')

        # Ignore the gradients applied by the LAMB/ADAM optimizers.
        if "adam_m" in name or "adam_v" in name or "global_step" in name:
            logger.info("Skipping {}".format("/".join(name)))
            continue

        pointer = model
        for m_name in name:
            # "<letters>_<digits>" is split into an attribute name and an index.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                scope_names = re.split(r'_(\d+)', m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == 'kernel' or scope_names[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif scope_names[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            elif scope_names[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    # NOTE: this `continue` only skips the current path
                    # component, not the whole variable; `pointer` is left
                    # where it was and the walk goes on with the next
                    # component. Kept as-is on purpose.
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            # Attach both shapes to the error to make the mismatch debuggable.
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {} from {}".format(name, original_name))
        pointer.data = torch.from_numpy(array)

    return model
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertEmbeddings(BertEmbeddings):
    """
    Construct the embeddings from word, position and token_type embeddings.

    The parent constructor runs first; the three embedding tables and the
    LayerNorm are then re-created with width `config.embedding_size`.
    """
    def __init__(self, config):
        super(AlbertEmbeddings, self).__init__(config)

        width = config.embedding_size
        self.word_embeddings = nn.Embedding(config.vocab_size, width, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, width)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertAttention(BertSelfAttention):
    """
    Self-attention followed by the output projection (`dense`), dropout,
    a residual connection and LayerNorm, all in one module.

    `query`, `key`, `value`, `all_head_size` and `transpose_for_scores` are
    used here but not defined here; they are expected from the parent class.
    """
    def __init__(self, config):
        super(AlbertAttention, self).__init__(config)

        self.output_attentions = config.output_attentions
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads; already pruned heads are ignored."""
        if len(heads) == 0:
            return
        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
        heads = set(heads) - self.pruned_heads  # Convert to set and remove already pruned heads
        for head in heads:
            # Compute how many pruned heads are before the head and move the index accordingly
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()

        # Prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, input_ids, attention_mask=None, head_mask=None):
        """
        Apply attention to `input_ids`.

        Despite its name, `input_ids` is the tensor fed to the query/key/value
        projections and added back as the residual before the LayerNorm.

        Returns `(output,)`, or `(output, attention_probs)` when
        `self.output_attentions` is set.
        """
        mixed_query_layer = self.query(input_ids)
        mixed_key_layer = self.key(input_ids)
        mixed_value_layer = self.value(input_ids)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # The mask is additive: it is summed onto the raw scores before the softmax.
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()

        # Should find a better way to do this
        # The dense weight is viewed per head so the einsum below contracts the
        # head (n) and head-size (d) axes of the 4-D context in one step.
        w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype)
        b = self.dense.bias.to(context_layer.dtype)

        projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
        projected_context_layer_dropout = self.dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer_dropout)
        return (layernormed_context_layer, attention_probs) if self.output_attentions else (layernormed_context_layer,)
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertLayer(nn.Module):
    """
    One layer: attention, then a two-step feed-forward (`ffn` -> activation ->
    `ffn_output`), then LayerNorm over the feed-forward output plus the
    attention output.
    """
    def __init__(self, config):
        super(AlbertLayer, self).__init__()

        self.config = config
        hidden, inner = config.hidden_size, config.intermediate_size
        # Sub-modules are created in the same order as before so parameter
        # registration order is unchanged.
        self.full_layer_layer_norm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.attention = AlbertAttention(config)
        self.ffn = nn.Linear(hidden, inner)
        self.ffn_output = nn.Linear(inner, hidden)
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        """Return `(hidden_states,)` followed by whatever the attention returned after its first element."""
        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
        attended = attention_outputs[0]

        projected = self.ffn_output(self.activation(self.ffn(attended)))
        hidden_states = self.full_layer_layer_norm(projected + attended)

        # add attentions if we output them
        return (hidden_states,) + attention_outputs[1:]
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertLayerGroup(nn.Module):
    """
    A group of `config.inner_group_num` AlbertLayer modules applied in sequence.
    """
    def __init__(self, config):
        super(AlbertLayerGroup, self).__init__()

        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        """
        Run every layer of the group in order.

        `head_mask` is indexed once per layer; when it is left at its default
        of None, every layer receives None instead of raising a TypeError.

        Returns a tuple: last hidden state, then (if `output_hidden_states`)
        the tuple of per-layer hidden states, then (if `output_attentions`)
        the tuple of per-layer attentions.
        """
        if head_mask is None:
            head_mask = [None] * len(self.albert_layers)

        layer_hidden_states = ()
        layer_attentions = ()

        for layer_index, albert_layer in enumerate(self.albert_layers):
            layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index])
            hidden_states = layer_output[0]

            if self.output_attentions:
                layer_attentions = layer_attentions + (layer_output[1],)

            if self.output_hidden_states:
                layer_hidden_states = layer_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (layer_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (layer_attentions,)
        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertTransformer(nn.Module):
    """
    Maps the input from `config.embedding_size` to `config.hidden_size`, then
    applies `config.num_hidden_layers` steps, each step calling the layer
    group selected by the step index.
    """
    def __init__(self, config):
        super(AlbertTransformer, self).__init__()

        self.config = config
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        """
        Returns a tuple: last hidden state, then (if `output_hidden_states`)
        the hidden states after the input mapping and after every step, then
        (if `output_attentions`) the concatenated attentions of every step.

        `head_mask` is sliced per group; when left at its default of None it
        is replaced by one None per hidden layer instead of raising a
        TypeError.
        """
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        num_layers = self.config.num_hidden_layers
        num_groups = self.config.num_hidden_groups
        if head_mask is None:
            head_mask = [None] * num_layers

        # Number of layers in a hidden group (loop-invariant, computed once)
        layers_per_group = int(num_layers / num_groups)

        all_attentions = ()

        if self.output_hidden_states:
            all_hidden_states = (hidden_states,)

        for i in range(num_layers):
            # Index of the hidden group
            group_idx = int(i / (num_layers / num_groups))

            group_head_mask = head_mask[group_idx * layers_per_group:(group_idx + 1) * layers_per_group]
            layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, group_head_mask)
            hidden_states = layer_group_output[0]

            if self.output_attentions:
                # The group returns its attentions as the last tuple element.
                all_attentions = all_attentions + layer_group_output[-1]

            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertPreTrainedModel(PreTrainedModel):
    """ Abstract base class that takes care of weight initialization and offers
        a simple interface for downloading and loading pretrained models.
    """
    config_class = AlbertConfig
    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "albert"

    def _init_weights(self, module):
        """ Initialize the weights of a single sub-module.

            Linear and embedding weights are drawn from a normal distribution with standard deviation
            ``config.initializer_range``; linear biases are zeroed; LayerNorm is reset to weight 1 / bias 0.
            Any other module type is left untouched.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            has_linear_bias = isinstance(module, nn.Linear) and module.bias is not None
            if has_linear_bias:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
|
||||||
|
|
||||||
|
|
||||||
|
ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in
|
||||||
|
`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
|
||||||
|
by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
|
||||||
|
two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT.
|
||||||
|
|
||||||
|
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
|
||||||
|
refer to the PyTorch documentation for all matter related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
|
||||||
|
https://arxiv.org/abs/1909.11942
|
||||||
|
|
||||||
|
.. _`torch.nn.Module`:
|
||||||
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
|
||||||
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
ALBERT_INPUTS_DOCSTRING = r"""
|
||||||
|
Inputs:
|
||||||
|
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||||
|
|
||||||
|
(a) For sequence pairs:
|
||||||
|
|
||||||
|
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||||
|
|
||||||
|
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||||
|
|
||||||
|
(b) For single sequences:
|
||||||
|
|
||||||
|
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||||
|
|
||||||
|
``token_type_ids: 0 0 0 0 0 0 0``
|
||||||
|
|
||||||
|
Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
|
the right rather than the left.
|
||||||
|
|
||||||
|
Indices can be obtained using :class:`transformers.AlbertTokenizer`.
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Segment token indices to indicate first and second portions of the inputs.
|
||||||
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
|
corresponds to a `sentence B` token
|
||||||
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertModel(AlbertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    """

    config_class = AlbertConfig
    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = load_tf_weights_in_albert
    base_model_prefix = "albert"

    def __init__(self, config):
        super(AlbertModel, self).__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
        self.pooler_activation = nn.Tanh()

        self.init_weights()

    def get_input_embeddings(self):
        """ Return the word-embedding module of the model. """
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """ Replace the word-embedding module of the model with ``value``. """
        self.embeddings.word_embeddings = value

    def _resize_token_embeddings(self, new_num_tokens):
        """ Swap the word embeddings for a copy resized to ``new_num_tokens`` and return the new module. """
        resized = self._get_resized_embeddings(self.embeddings.word_embeddings, new_num_tokens)
        self.embeddings.word_embeddings = resized
        return self.embeddings.word_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
            If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
            is a total of 4 different layers.

            These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
            while [2,3] correspond to the two inner groups of the second hidden layer.

            Any layer with an index other than [0,1,2,3] will result in an error.
            See base class PreTrainedModel for more information about head pruning
        """
        inner_group_num = self.config.inner_group_num
        for flat_layer, heads in heads_to_prune.items():
            # Un-flatten the layer index into (hidden group, layer inside that group).
            group = int(flat_layer / inner_group_num)
            inner = int(flat_layer - group * inner_group_num)
            target_layer = self.encoder.albert_layer_groups[group].albert_layers[inner]
            target_layer.attention.prune_heads(heads)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                inputs_embeds=None):
        has_ids = input_ids is not None
        has_embeds = inputs_embeds is not None

        # Exactly one of input_ids / inputs_embeds must be given.
        if has_ids and has_embeds:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if not has_ids and not has_embeds:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if has_ids:
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Masks are cast to the dtype of the model parameters (fp16 compatibility).
        param_dtype = next(self.parameters()).dtype

        # Additive mask: 0.0 where attention_mask is 1, -10000.0 where it is 0.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2).to(dtype=param_dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        if head_mask is None:
            head_mask = [None] * self.config.num_hidden_layers
        else:
            if head_mask.dim() == 1:
                # A single mask over heads is shared by every hidden layer.
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                # We can specify head_mask for each layer
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
            head_mask = head_mask.to(dtype=param_dtype)  # switch to float if needed + fp16 compatibility

        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                                           inputs_embeds=inputs_embeds)
        encoder_outputs = self.encoder(embedding_output,
                                       extended_attention_mask,
                                       head_mask=head_mask)

        sequence_output = encoder_outputs[0]

        # Pool by projecting the hidden state of the first token and applying the pooler activation.
        first_token_state = sequence_output[:, 0]
        pooled_output = self.pooler_activation(self.pooler(first_token_state))

        # add hidden_states and attentions if they are here
        return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||||
|
|
||||||
|
class AlbertMLMHead(nn.Module):
    """ Masked-language-modeling head: dense projection, activation, LayerNorm, then a decoder
        to vocabulary size, to which a separate bias parameter is added.
    """

    def __init__(self, config):
        super(AlbertMLMHead, self).__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        """ Return the prediction scores computed from ``hidden_states``. """
        projected = self.dense(hidden_states)
        activated = self.activation(projected)
        normalized = self.LayerNorm(activated)
        decoded = self.decoder(normalized)

        return decoded + self.bias
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertForMaskedLM(AlbertPreTrainedModel):
    r"""
        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    """

    def __init__(self, config):
        super(AlbertForMaskedLM, self).__init__(config)

        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)

        self.init_weights()
        self.tie_weights()

    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(self.predictions.decoder,
                                   self.albert.embeddings.word_embeddings)

    def get_output_embeddings(self):
        """ Return the decoder module of the language modeling head. """
        return self.predictions.decoder

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
                masked_lm_labels=None):
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )
        sequence_outputs = outputs[0]

        prediction_scores = self.predictions(sequence_outputs)

        # Element 1 of the base model output (the pooled output) is dropped here.
        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
        if masked_lm_labels is not None:
            # Label -1 marks the positions that do not contribute to the loss.
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
            outputs = (masked_lm_loss,) + outputs

        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
        super(AlbertForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        base_outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )

        # Element 1 of the base model output is the pooled output.
        logits = self.classifier(self.dropout(base_outputs[1]))

        # add hidden states and attention if they are here
        result = (logits,) + base_outputs[2:]

        if labels is None:
            return result  # logits, (hidden_states), (attentions)

        if self.num_labels == 1:
            # We are doing regression
            criterion = MSELoss()
            loss = criterion(logits.view(-1), labels.view(-1))
        else:
            criterion = CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss,) + result  # loss, logits, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    r"""
        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Total span extraction loss is the average of a Cross-Entropy for the start and end positions.
        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
            Span-start scores (before SoftMax).
        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
            Span-end scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
        input_ids = tokenizer.encode(input_text)
        # Look the separator id up on the tokenizer instead of hard-coding a vocabulary-specific value
        token_type_ids = [0 if i <= input_ids.index(tokenizer.sep_token_id) else 1 for i in range(len(input_ids))]
        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
        # a nice puppet


    """
    def __init__(self, config):
        super(AlbertForQuestionAnswering, self).__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                inputs_embeds=None, start_positions=None, end_positions=None):

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )

        sequence_output = outputs[0]

        # The last dimension of the logits is split into the start score and the end score.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits,) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            # Out-of-place clamp: the in-place variant would overwrite the caller's label tensors.
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
|
||||||
@@ -27,6 +27,9 @@ from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassi
|
|||||||
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
|
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
|
||||||
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
|
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
|
||||||
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
|
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
|
||||||
|
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
|
||||||
|
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
|
||||||
|
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
|
||||||
|
|
||||||
from .modeling_utils import PreTrainedModel, SequenceSummary
|
from .modeling_utils import PreTrainedModel, SequenceSummary
|
||||||
|
|
||||||
@@ -48,14 +51,16 @@ class AutoModel(object):
|
|||||||
The base model class to instantiate is selected as the first pattern matching
|
The base model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertModel (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertModel (CamemBERT model)
|
||||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||||
- contains `bert`: BertModel (Bert model)
|
- contains `bert`: BertModel (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||||
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
|
||||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetModel (XLNet model)
|
- contains `xlnet`: XLNetModel (XLNet model)
|
||||||
- contains `xlm`: XLMModel (XLM model)
|
- contains `xlm`: XLMModel (XLM model)
|
||||||
|
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
||||||
|
|
||||||
This class cannot be instantiated using `__init__()` (throws an error).
|
This class cannot be instantiated using `__init__()` (throws an error).
|
||||||
"""
|
"""
|
||||||
@@ -71,14 +76,16 @@ class AutoModel(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertModel (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertModel (CamemBERT model)
|
||||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||||
- contains `bert`: BertModel (Bert model)
|
- contains `bert`: BertModel (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||||
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
|
||||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetModel (XLNet model)
|
- contains `xlnet`: XLNetModel (XLNet model)
|
||||||
- contains `xlm`: XLMModel (XLM model)
|
- contains `xlm`: XLMModel (XLM model)
|
||||||
|
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
||||||
|
|
||||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||||
To train the model, you should first set it back in training mode with `model.train()`
|
To train the model, you should first set it back in training mode with `model.train()`
|
||||||
@@ -112,6 +119,9 @@ class AutoModel(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force a (re-)download of the model weights and configuration files, overriding the cached versions if they exist.
|
Force a (re-)download of the model weights and configuration files, overriding the cached versions if they exist.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -138,6 +148,10 @@ class AutoModel(object):
|
|||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'camembert' in pretrained_model_name_or_path:
|
||||||
|
return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -156,7 +170,7 @@ class AutoModel(object):
|
|||||||
return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||||
"'xlm', 'roberta, 'ctrl'".format(pretrained_model_name_or_path))
|
"'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
class AutoModelWithLMHead(object):
|
class AutoModelWithLMHead(object):
|
||||||
@@ -172,14 +186,16 @@ class AutoModelWithLMHead(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
||||||
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
||||||
- contains `bert`: BertForMaskedLM (Bert model)
|
- contains `bert`: BertForMaskedLM (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
|
||||||
- contains `ctrl`: CTRLLMModel (Salesforce CTRL model)
|
|
||||||
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
||||||
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
||||||
|
- contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
|
||||||
|
|
||||||
This class cannot be instantiated using `__init__()` (throws an error).
|
This class cannot be instantiated using `__init__()` (throws an error).
|
||||||
"""
|
"""
|
||||||
@@ -198,6 +214,8 @@ class AutoModelWithLMHead(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
||||||
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
||||||
- contains `bert`: BertForMaskedLM (Bert model)
|
- contains `bert`: BertForMaskedLM (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
||||||
@@ -205,6 +223,7 @@ class AutoModelWithLMHead(object):
|
|||||||
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
||||||
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
||||||
|
- contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
|
||||||
|
|
||||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||||
To train the model, you should first set it back in training mode with `model.train()`
|
To train the model, you should first set it back in training mode with `model.train()`
|
||||||
@@ -237,6 +256,8 @@ class AutoModelWithLMHead(object):
|
|||||||
|
|
||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force a (re-)download of the model weights and configuration files, overriding the cached versions if they exist.
|
Force a (re-)download of the model weights and configuration files, overriding the cached versions if they exist.
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
@@ -264,6 +285,10 @@ class AutoModelWithLMHead(object):
|
|||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'camembert' in pretrained_model_name_or_path:
|
||||||
|
return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -282,7 +307,7 @@ class AutoModelWithLMHead(object):
|
|||||||
return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||||
"'xlm', 'roberta','ctrl'".format(pretrained_model_name_or_path))
|
"'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
class AutoModelForSequenceClassification(object):
|
class AutoModelForSequenceClassification(object):
|
||||||
@@ -298,6 +323,8 @@ class AutoModelForSequenceClassification(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForSequenceClassification (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
|
||||||
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
||||||
- contains `bert`: BertForSequenceClassification (Bert model)
|
- contains `bert`: BertForSequenceClassification (Bert model)
|
||||||
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
||||||
@@ -320,6 +347,8 @@ class AutoModelForSequenceClassification(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForSequenceClassification (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
|
||||||
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
||||||
- contains `bert`: BertForSequenceClassification (Bert model)
|
- contains `bert`: BertForSequenceClassification (Bert model)
|
||||||
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
||||||
@@ -357,6 +386,9 @@ class AutoModelForSequenceClassification(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force a (re-)download of the model weights and configuration files, overriding the cached versions if they exist.
|
Force a (re-)download of the model weights and configuration files, overriding the cached versions if they exist.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -383,6 +415,10 @@ class AutoModelForSequenceClassification(object):
|
|||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'camembert' in pretrained_model_name_or_path:
|
||||||
|
return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -393,7 +429,7 @@ class AutoModelForSequenceClassification(object):
|
|||||||
return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
"'bert', 'xlnet', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
class AutoModelForQuestionAnswering(object):
|
class AutoModelForQuestionAnswering(object):
|
||||||
@@ -409,6 +445,7 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForQuestionAnswering (ALBERT model)
|
||||||
- contains `bert`: BertForQuestionAnswering (Bert model)
|
- contains `bert`: BertForQuestionAnswering (Bert model)
|
||||||
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
||||||
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
||||||
@@ -430,6 +467,7 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForQuestionAnswering (ALBERT model)
|
||||||
- contains `bert`: BertForQuestionAnswering (Bert model)
|
- contains `bert`: BertForQuestionAnswering (Bert model)
|
||||||
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
||||||
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
||||||
@@ -492,6 +530,8 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'xlnet' in pretrained_model_name_or_path:
|
elif 'xlnet' in pretrained_model_name_or_path:
|
||||||
@@ -500,4 +540,4 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
|
"'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|||||||
@@ -138,7 +138,11 @@ def swish(x):
|
|||||||
return x * torch.sigmoid(x)
|
return x * torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
|
def mish(x):
|
||||||
|
return x * torch.tanh(nn.functional.softplus(x))
|
||||||
|
|
||||||
|
|
||||||
|
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish}
|
||||||
|
|
||||||
|
|
||||||
BertLayerNorm = torch.nn.LayerNorm
|
BertLayerNorm = torch.nn.LayerNorm
|
||||||
@@ -278,7 +282,7 @@ class BertAttention(nn.Module):
|
|||||||
if len(heads) == 0:
|
if len(heads) == 0:
|
||||||
return
|
return
|
||||||
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
|
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
|
||||||
heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads
|
heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
# Compute how many pruned heads are before the head and move the index accordingly
|
# Compute how many pruned heads are before the head and move the index accordingly
|
||||||
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||||
@@ -597,7 +601,7 @@ class BertModel(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertModel.from_pretrained('bert-base-uncased')
|
model = BertModel.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids)
|
||||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||||
|
|
||||||
@@ -656,8 +660,6 @@ class BertModel(BertPreTrainedModel):
|
|||||||
|
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = torch.ones(input_shape, device=device)
|
attention_mask = torch.ones(input_shape, device=device)
|
||||||
if encoder_attention_mask is None:
|
|
||||||
encoder_attention_mask = torch.ones(input_shape, device=device)
|
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
||||||
|
|
||||||
@@ -665,11 +667,10 @@ class BertModel(BertPreTrainedModel):
|
|||||||
# ourselves in which case we just need to make it broadcastable to all heads.
|
# ourselves in which case we just need to make it broadcastable to all heads.
|
||||||
if attention_mask.dim() == 3:
|
if attention_mask.dim() == 3:
|
||||||
extended_attention_mask = attention_mask[:, None, :, :]
|
extended_attention_mask = attention_mask[:, None, :, :]
|
||||||
|
elif attention_mask.dim() == 2:
|
||||||
# Provided a padding mask of dimensions [batch_size, seq_length]
|
# Provided a padding mask of dimensions [batch_size, seq_length]
|
||||||
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
||||||
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||||
if attention_mask.dim() == 2:
|
|
||||||
if self.config.is_decoder:
|
if self.config.is_decoder:
|
||||||
batch_size, seq_length = input_shape
|
batch_size, seq_length = input_shape
|
||||||
seq_ids = torch.arange(seq_length, device=device)
|
seq_ids = torch.arange(seq_length, device=device)
|
||||||
@@ -677,6 +678,8 @@ class BertModel(BertPreTrainedModel):
|
|||||||
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
||||||
else:
|
else:
|
||||||
extended_attention_mask = attention_mask[:, None, None, :]
|
extended_attention_mask = attention_mask[:, None, None, :]
|
||||||
|
else:
|
||||||
|
raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
|
||||||
|
|
||||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||||
# masked positions, this operation will create a tensor which is 0.0 for
|
# masked positions, this operation will create a tensor which is 0.0 for
|
||||||
@@ -688,13 +691,22 @@ class BertModel(BertPreTrainedModel):
|
|||||||
|
|
||||||
# If a 2D or 3D attention mask is provided for the cross-attention
|
# If a 2D or 3D attention mask is provided for the cross-attention
|
||||||
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||||
if encoder_attention_mask.dim() == 3:
|
if self.config.is_decoder:
|
||||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
|
if encoder_attention_mask is None:
|
||||||
if encoder_attention_mask.dim() == 2:
|
encoder_attention_mask = torch.ones(input_shape, device=device)
|
||||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
|
|
||||||
|
|
||||||
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
if encoder_attention_mask.dim() == 3:
|
||||||
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
|
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
|
||||||
|
elif encoder_attention_mask.dim() == 2:
|
||||||
|
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
|
||||||
|
else:
|
||||||
|
raise ValueError("Wrong shape for input_ids (shape {}) or encoder_attention_mask (shape {})".format(input_shape,
|
||||||
|
encoder_attention_mask.shape))
|
||||||
|
|
||||||
|
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||||
|
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
|
||||||
|
else:
|
||||||
|
encoder_extended_attention_mask = None
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
# 1.0 in head_mask indicate we keep the head
|
# 1.0 in head_mask indicate we keep the head
|
||||||
@@ -760,7 +772,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForPreTraining.from_pretrained('bert-base-uncased')
|
model = BertForPreTraining.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids)
|
||||||
prediction_scores, seq_relationship_scores = outputs[:2]
|
prediction_scores, seq_relationship_scores = outputs[:2]
|
||||||
|
|
||||||
@@ -836,7 +848,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
|
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||||
loss, prediction_scores = outputs[:2]
|
loss, prediction_scores = outputs[:2]
|
||||||
|
|
||||||
@@ -919,7 +931,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
|
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids)
|
||||||
seq_relationship_scores = outputs[0]
|
seq_relationship_scores = outputs[0]
|
||||||
|
|
||||||
@@ -984,7 +996,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss, logits = outputs[:2]
|
loss, logits = outputs[:2]
|
||||||
@@ -1060,7 +1072,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
|
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
|
||||||
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
|
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
|
||||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||||
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
|
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss, classification_scores = outputs[:2]
|
loss, classification_scores = outputs[:2]
|
||||||
@@ -1134,7 +1146,7 @@ class BertForTokenClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
|
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss, scores = outputs[:2]
|
loss, scores = outputs[:2]
|
||||||
|
|||||||
@@ -63,7 +63,8 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
|
|||||||
scaled_attention_logits = matmul_qk / np.sqrt(dk)
|
scaled_attention_logits = matmul_qk / np.sqrt(dk)
|
||||||
|
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
scaled_attention_logits += (mask * -1e4)
|
nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
|
||||||
|
scaled_attention_logits += (mask[ns-nd:ns, :ns] * -1e4)
|
||||||
|
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
# Apply the attention mask
|
# Apply the attention mask
|
||||||
@@ -251,7 +252,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
|||||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
Sequence of hidden-states at the last layer of the model.
|
Sequence of hidden-states at the last layer of the model.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
should not be passed as input ids as they have already been computed.
|
should not be passed as input ids as they have already been computed.
|
||||||
@@ -373,7 +374,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
|||||||
inputs_embeds = self.w(input_ids)
|
inputs_embeds = self.w(input_ids)
|
||||||
# inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
|
# inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
|
||||||
seq_len = input_shape[-1]
|
seq_len = input_shape[-1]
|
||||||
mask = torch.triu(torch.ones(seq_len, seq_len), 1).to(inputs_embeds.device)
|
mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(inputs_embeds.device)
|
||||||
|
|
||||||
inputs_embeds *= np.sqrt(self.d_model_size)
|
inputs_embeds *= np.sqrt(self.d_model_size)
|
||||||
|
|
||||||
@@ -437,7 +438,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
|||||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
should not be passed as input ids as they have already been computed.
|
should not be passed as input ids as they have already been computed.
|
||||||
|
|||||||
@@ -42,7 +42,9 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
|
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
|
||||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
|
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
|
||||||
|
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
|
||||||
|
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -329,7 +329,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
Sequence of hidden-states at the last layer of the model.
|
Sequence of hidden-states at the last layer of the model.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
should not be passed as input ids as they have already been computed.
|
should not be passed as input ids as they have already been computed.
|
||||||
@@ -503,7 +503,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
should not be passed as input ids as they have already been computed.
|
should not be passed as input ids as they have already been computed.
|
||||||
@@ -596,7 +596,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
**mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
|
**mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
|
||||||
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
should not be passed as input ids as they have already been computed.
|
should not be passed as input ids as they have already been computed.
|
||||||
|
|||||||
@@ -50,8 +50,10 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
|
|||||||
|
|
||||||
logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
|
logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
|
||||||
|
|
||||||
names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
|
with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
|
||||||
shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
|
names = json.load(names_handle)
|
||||||
|
with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
|
||||||
|
shapes = json.load(shapes_handle)
|
||||||
offsets = np.cumsum([np.prod(shape) for shape in shapes])
|
offsets = np.cumsum([np.prod(shape) for shape in shapes])
|
||||||
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
|
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
|
||||||
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
|
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
|
||||||
|
|||||||
794
transformers/modeling_tf_albert.py
Normal file
794
transformers/modeling_tf_albert.py
Normal file
@@ -0,0 +1,794 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 ALBERT model. """
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .configuration_albert import AlbertConfig
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||||
|
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maps each pretrained ALBERT shortcut name (v1 and v2; base, large, xlarge, xxlarge)
# to the URL of its ``-tf_model.h5`` weights file.
# Referenced by ``TFAlbertPreTrainedModel.pretrained_model_archive_map``.
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
    'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
    'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
    'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
    'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
    'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
    'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
    'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
}
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertEmbeddings(tf.keras.layers.Layer):
    """Embedding layer summing word, position and token-type embeddings.

    The word-embedding matrix is created in ``build`` and is used both to look
    up token embeddings (``mode="embedding"``) and, transposed, to project
    hidden vectors back onto the vocabulary (``mode="linear"``).
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertEmbeddings, self).__init__(**kwargs)

        self.config = config

        position_init = get_initializer(self.config.initializer_range)
        self.position_embeddings = tf.keras.layers.Embedding(
            config.max_position_embeddings,
            config.embedding_size,
            embeddings_initializer=position_init,
            name='position_embeddings')

        token_type_init = get_initializer(self.config.initializer_range)
        self.token_type_embeddings = tf.keras.layers.Embedding(
            config.type_vocab_size,
            config.embedding_size,
            embeddings_initializer=token_type_init,
            name='token_type_embeddings')

        # The attribute is deliberately named ``LayerNorm`` (not snake_case) so the
        # variable names line up with TensorFlow checkpoint files.
        self.LayerNorm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps,
            name='LayerNorm')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def build(self, input_shape):
        """Create the shared word-embedding weight, then finish the Keras build."""
        with tf.name_scope("word_embeddings"):
            # Weight of shape [vocab_size, embedding_size]; reused by ``_linear``.
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.config.vocab_size, self.config.embedding_size],
                initializer=get_initializer(self.config.initializer_range))
        super(TFAlbertEmbeddings, self).build(input_shape)

    def call(self, inputs, mode="embedding", training=False):
        """Run the layer in one of its two modes.

        Args:
            inputs: for ``mode="embedding"``, a 4-item sequence
                ``(input_ids, position_ids, token_type_ids, inputs_embeds)`` (see
                ``_embedding``); for ``mode="linear"``, the tensor to project
                (see ``_linear``).
            mode: string, either ``"embedding"`` or ``"linear"``.
            training: forwarded to the dropout layer in embedding mode.
        Returns:
            The embedding tensor (``"embedding"``) or the vocabulary logits (``"linear"``).
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "linear":
            return self._linear(inputs)
        if mode != "embedding":
            raise ValueError("mode {} is not valid.".format(mode))
        return self._embedding(inputs, training=training)

    def _embedding(self, inputs, training=False):
        """Build the summed, normalised embeddings for a batch.

        ``inputs`` unpacks to ``(input_ids, position_ids, token_type_ids, inputs_embeds)``.
        ``position_ids`` and ``token_type_ids`` may be ``None`` and are then
        defaulted; ``inputs_embeds`` is looked up from ``input_ids`` when ``None``.
        """
        input_ids, position_ids, token_type_ids, inputs_embeds = inputs

        # Shape of the id grid: taken from the ids, or from the embeddings minus
        # their trailing feature axis when no ids were given.
        if input_ids is None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            input_shape = shape_list(input_ids)

        if position_ids is None:
            # Default positions are 0..seq_length-1, with a leading axis of size 1.
            position_ids = tf.range(input_shape[1], dtype=tf.int32)[tf.newaxis, :]
        if token_type_ids is None:
            # Default every token to segment 0.
            token_type_ids = tf.fill(input_shape, 0)
        if inputs_embeds is None:
            inputs_embeds = tf.gather(self.word_embeddings, input_ids)

        summed = (inputs_embeds
                  + self.position_embeddings(position_ids)
                  + self.token_type_embeddings(token_type_ids))
        normalised = self.LayerNorm(summed)
        return self.dropout(normalised, training=training)

    def _linear(self, inputs):
        """Project ``inputs`` onto the vocabulary with the transposed word embeddings.

        Args:
            inputs: tensor whose trailing axis has size ``embedding_size``; its first
                two axes are treated as batch and length.
        Returns:
            Tensor of shape [batch_size, length, vocab_size].
        """
        dims = shape_list(inputs)
        batch_size, length = dims[0], dims[1]

        flat = tf.reshape(inputs, [-1, self.config.embedding_size])
        logits = tf.matmul(flat, self.word_embeddings, transpose_b=True)
        return tf.reshape(logits, [batch_size, length, self.config.vocab_size])
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertSelfAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product self-attention.

    Projects the hidden states to per-head queries, keys and values, computes
    softmax attention weights and returns the resulting context vectors.
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertSelfAttention, self).__init__(**kwargs)
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.output_attentions = config.output_attentions

        self.num_attention_heads = config.num_attention_heads
        assert config.hidden_size % config.num_attention_heads == 0
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        def make_projection(layer_name):
            # Query, key and value share the same width and initializer settings.
            return tf.keras.layers.Dense(
                self.all_head_size,
                kernel_initializer=get_initializer(config.initializer_range),
                name=layer_name)

        self.query = make_projection('query')
        self.key = make_projection('key')
        self.value = make_projection('value')

        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x, batch_size):
        """Split the last axis into heads and move the head axis forward.

        Reshapes ``x`` to (batch_size, -1, num_heads, head_size) and returns it
        transposed to (batch_size, num_heads, -1, head_size).
        """
        per_head = tf.reshape(
            x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs, training=False):
        """Apply self-attention.

        Args:
            inputs: sequence ``(hidden_states, attention_mask, head_mask)``; the two
                masks may be ``None``.
            training: forwarded to the attention dropout.
        Returns:
            ``(context_layer,)`` or, when ``output_attentions`` is set,
            ``(context_layer, attention_probs)``.
        """
        hidden_states, attention_mask, head_mask = inputs
        batch_size = shape_list(hidden_states)[0]

        queries = self.transpose_for_scores(self.query(hidden_states), batch_size)
        keys = self.transpose_for_scores(self.key(hidden_states), batch_size)
        values = self.transpose_for_scores(self.value(hidden_states), batch_size)

        # Raw scores, (batch size, num_heads, seq_len_q, seq_len_k), scaled by
        # the square root of the per-head key width.
        scores = tf.matmul(queries, keys, transpose_b=True)
        key_width = tf.cast(shape_list(keys)[-1], tf.float32)
        scores = scores / tf.math.sqrt(key_width)

        if attention_mask is not None:
            # The mask is additive: it is summed onto the scores before the softmax.
            scores = scores + attention_mask

        # Turn the scores into probabilities over the key axis.
        attention_probs = tf.nn.softmax(scores, axis=-1)

        # Dropout here removes whole tokens from being attended to, as in the
        # original Transformer paper.
        attention_probs = self.dropout(attention_probs, training=training)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context = tf.matmul(attention_probs, values)
        # Move heads back next to the feature axis and merge them:
        # (batch_size, seq_len_q, all_head_size)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        context = tf.reshape(context, (batch_size, -1, self.all_head_size))

        if self.output_attentions:
            return (context, attention_probs)
        return (context,)
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertSelfOutput(tf.keras.layers.Layer):
    """Dense projection, dropout, then residual add and layer normalisation."""

    def __init__(self, config, **kwargs):
        super(TFAlbertSelfOutput, self).__init__(**kwargs)
        dense_init = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=dense_init,
            name='dense')
        self.LayerNorm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps,
            name='LayerNorm')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def call(self, inputs, training=False):
        """Project ``hidden_states`` and normalise its sum with ``input_tensor``.

        Args:
            inputs: pair ``(hidden_states, input_tensor)``; ``input_tensor`` is the
                residual added before the layer norm.
            training: forwarded to the dropout layer.
        Returns:
            The normalised tensor.
        """
        hidden_states, input_tensor = inputs

        projected = self.dropout(self.dense(hidden_states), training=training)
        return self.LayerNorm(projected + input_tensor)
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertAttention(TFBertSelfAttention):
    """Self-attention followed by an output projection, residual add and layer norm.

    The query/key/value projections, ``transpose_for_scores``, ``dropout``,
    ``all_head_size`` and ``output_attentions`` used below are not defined here;
    they come from the ``TFBertSelfAttention`` base class.
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertAttention, self).__init__(config, **kwargs)

        self.hidden_size = config.hidden_size
        output_init = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=output_init,
            name='dense')
        self.LayerNorm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps,
            name='LayerNorm')
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(self, inputs, training=False):
        """Attend over ``input_tensor`` and return the normalised attention output.

        Args:
            inputs: sequence ``(input_tensor, attention_mask, head_mask)``; the two
                masks may be ``None``.
            training: forwarded to the dropout layer.
        Returns:
            ``(attention_output,)`` or, when ``output_attentions`` is set,
            ``(attention_output, attention_probs)``.
        """
        input_tensor, attention_mask, head_mask = inputs
        batch_size = shape_list(input_tensor)[0]

        queries = self.transpose_for_scores(self.query(input_tensor), batch_size)
        keys = self.transpose_for_scores(self.key(input_tensor), batch_size)
        values = self.transpose_for_scores(self.value(input_tensor), batch_size)

        # Raw scores, (batch size, num_heads, seq_len_q, seq_len_k), scaled by
        # the square root of the per-head key width.
        scores = tf.matmul(queries, keys, transpose_b=True)
        key_width = tf.cast(shape_list(keys)[-1], tf.float32)
        scores = scores / tf.math.sqrt(key_width)

        if attention_mask is not None:
            # The mask is additive: it is summed onto the scores before the softmax.
            scores = scores + attention_mask

        # Turn the scores into probabilities over the key axis.
        attention_probs = tf.nn.softmax(scores, axis=-1)

        # Dropout here removes whole tokens from being attended to, as in the
        # original Transformer paper.
        attention_probs = self.dropout(attention_probs, training=training)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context = tf.matmul(attention_probs, values)
        # Move heads back next to the feature axis and merge them:
        # (batch_size, seq_len_q, all_head_size)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        context = tf.reshape(context, (batch_size, -1, self.all_head_size))

        # Output projection; the same dropout layer is reused on the projection.
        projected = self.dense(context)
        projected = self.dropout(projected, training=training)
        attention_output = self.LayerNorm(projected + input_tensor)

        # Attention probabilities are appended only when requested.
        if self.output_attentions:
            return (attention_output, attention_probs)
        return (attention_output,)
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertLayer(tf.keras.layers.Layer):
    """One ALBERT block: attention followed by a two-layer feed-forward network.

    The feed-forward output goes through dropout, is added to the attention
    output (residual connection) and is then layer-normalised.
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertLayer, self).__init__(**kwargs)
        self.attention = TFAlbertAttention(config, name='attention')

        self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer(
            config.initializer_range), name='ffn')

        # ``hidden_act`` is either the name of an activation registered in ACT2FN
        # or a callable used as-is. The ``unicode`` test is only evaluated on Python 2.
        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act

        self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
            config.initializer_range), name='ffn_output')
        self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name='full_layer_layer_norm')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def call(self, inputs, training=False):
        """Run attention and the feed-forward network on ``hidden_states``.

        Args:
            inputs: sequence ``(hidden_states, attention_mask, head_mask)``, passed
                through unchanged to the attention sub-layer.
            training: forwarded to the attention sub-layer and to dropout.
        Returns:
            ``(hidden_states,)`` followed by whatever extra outputs the attention
            sub-layer returned after its first element.
        """
        hidden_states, attention_mask, head_mask = inputs

        attention_outputs = self.attention(
            [hidden_states, attention_mask, head_mask], training=training)
        ffn_output = self.ffn(attention_outputs[0])
        ffn_output = self.activation(ffn_output)
        ffn_output = self.ffn_output(ffn_output)

        # Dropout must act on the feed-forward output. It used to be applied to the
        # layer input, whose result was overwritten on the next line, so it had no effect.
        ffn_output = self.dropout(ffn_output, training=training)
        hidden_states = self.full_layer_layer_norm(
            ffn_output + attention_outputs[0])

        # add attentions if we output them
        outputs = (hidden_states,) + attention_outputs[1:]
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertLayerGroup(tf.keras.layers.Layer):
    """A stack of ``config.inner_group_num`` ALBERT layers applied in sequence."""

    def __init__(self, config, **kwargs):
        super(TFAlbertLayerGroup, self).__init__(**kwargs)

        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.albert_layers = [
            TFAlbertLayer(config, name="albert_layers_._{}".format(i))
            for i in range(config.inner_group_num)
        ]

    def call(self, inputs, training=False):
        """Feed ``hidden_states`` through every layer of the group.

        Args:
            inputs: sequence ``(hidden_states, attention_mask, head_mask)``;
                ``head_mask`` is indexed by the position of each layer in the group.
            training: forwarded to each layer.
        Returns:
            Tuple of: last hidden state, then (if ``output_hidden_states``) a tuple
            with each layer's output, then (if ``output_attentions``) a tuple with
            each layer's attentions.
        """
        hidden_states, attention_mask, head_mask = inputs

        per_layer_states = ()
        per_layer_attentions = ()

        for position, layer in enumerate(self.albert_layers):
            layer_inputs = [hidden_states, attention_mask, head_mask[position]]
            layer_result = layer(layer_inputs, training=training)
            hidden_states = layer_result[0]

            if self.output_attentions:
                per_layer_attentions += (layer_result[1],)

            if self.output_hidden_states:
                per_layer_states += (hidden_states,)

        # last-layer hidden state, (layer hidden states), (layer attentions)
        result = (hidden_states,)
        if self.output_hidden_states:
            result += (per_layer_states,)
        if self.output_attentions:
            result += (per_layer_attentions,)
        return result
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertTransformer(tf.keras.layers.Layer):
    """ALBERT encoder: an input projection followed by layer groups that are reused.

    ``config.num_hidden_layers`` steps are run over ``config.num_hidden_groups``
    layer groups, so a group is applied on several consecutive steps.
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertTransformer, self).__init__(**kwargs)

        self.config = config
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name='embedding_hidden_mapping_in')
        self.albert_layer_groups = [
            TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i))
            for i in range(config.num_hidden_groups)
        ]

    def call(self, inputs, training=False):
        """Run the encoder.

        Args:
            inputs: sequence ``(hidden_states, attention_mask, head_mask)``;
                ``head_mask`` is sliced per group before being passed down.
            training: forwarded to each layer group.
        Returns:
            Tuple of: last hidden state, then (if ``output_hidden_states``) all
            hidden states including the projected input, then (if
            ``output_attentions``) all attentions.
        """
        hidden_states, attention_mask, head_mask = inputs

        # Map the embeddings to the encoder's hidden size.
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
        all_attentions = ()

        if self.output_hidden_states:
            all_hidden_states = (hidden_states,)

        for step in range(self.config.num_hidden_layers):
            # Number of layers in a hidden group
            layers_per_group = int(
                self.config.num_hidden_layers / self.config.num_hidden_groups)

            # Index of the hidden group this step belongs to
            group_idx = int(
                step / (self.config.num_hidden_layers / self.config.num_hidden_groups))

            # Slice of the head mask that belongs to this group.
            mask_start = group_idx * layers_per_group
            mask_stop = (group_idx + 1) * layers_per_group
            group_inputs = [hidden_states, attention_mask, head_mask[mask_start:mask_stop]]

            group_result = self.albert_layer_groups[group_idx](group_inputs, training=training)
            hidden_states = group_result[0]

            if self.output_attentions:
                # The group's attentions are the last element of its output tuple.
                all_attentions += group_result[-1]

            if self.output_hidden_states:
                all_hidden_states += (hidden_states,)

        # last-layer hidden state, (all hidden states), (all attentions)
        result = (hidden_states,)
        if self.output_hidden_states:
            result += (all_hidden_states,)
        if self.output_attentions:
            result += (all_attentions,)
        return result
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Configuration class associated with ALBERT models.
    config_class = AlbertConfig
    # Shortcut name -> URL of the pretrained TF weights file.
    pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    # Prefix string for the base model; how it is consumed is defined by
    # TFPreTrainedModel, which is not shown in this file.
    base_model_prefix = "albert"
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertMLMHead(tf.keras.layers.Layer):
    """Masked-language-modeling head producing one score per vocabulary entry.

    Applies a dense layer, activation and layer norm, then projects with the
    shared input embeddings (``mode="linear"``) and adds two bias vectors.
    """

    def __init__(self, config, input_embeddings, **kwargs):
        super(TFAlbertMLMHead, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size

        self.dense = tf.keras.layers.Dense(
            config.embedding_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name='dense')

        # ``hidden_act`` is either the name of an activation registered in ACT2FN
        # or a callable used as-is. The ``unicode`` test is only evaluated on Python 2.
        named_activation = isinstance(config.hidden_act, str) or (
            sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode))
        self.activation = ACT2FN[config.hidden_act] if named_activation else config.hidden_act

        self.LayerNorm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps,
            name='LayerNorm')

        # The output projection reuses the input embedding layer; only the
        # biases created in ``build`` are specific to this head.
        self.decoder = input_embeddings

    def build(self, input_shape):
        """Create the two vocabulary-sized, zero-initialised bias vectors."""
        bias_options = dict(shape=(self.vocab_size,), initializer='zeros', trainable=True)
        self.bias = self.add_weight(name='bias', **bias_options)
        self.decoder_bias = self.add_weight(name='decoder/bias', **bias_options)
        super(TFAlbertMLMHead, self).build(input_shape)

    def call(self, hidden_states):
        """Return vocabulary scores for ``hidden_states``."""
        transformed = self.LayerNorm(self.activation(self.dense(hidden_states)))
        scores = self.decoder(transformed, mode="linear") + self.decoder_bias
        return scores + self.bias
|
||||||
|
|
||||||
|
|
||||||
|
# Shared introduction prepended to the docstring of every TF ALBERT model class.
ALBERT_START_DOCSTRING = r"""    The ALBERT model was proposed in
    `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
    by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
    two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT.

    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.

    .. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
        https://arxiv.org/abs/1909.11942

    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model

    Note on the model inputs:
        TF 2.0 models accept two formats as inputs:

            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.

        This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.

        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:

        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the model weights.
"""
|
||||||
|
|
||||||
|
# Shared description of the model inputs, appended after ALBERT_START_DOCSTRING
# in the docstring of every TF ALBERT model class.
ALBERT_INPUTS_DOCSTRING = r"""
    Inputs:
        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            To match pre-training, ALBERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:

            (a) For sequence pairs:

                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``

                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``

            (b) For single sequences:

                ``tokens:         [CLS] the dog is hairy . [SEP]``

                ``token_type_ids:   0   0   0   0  0     0   0``

            Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.

            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token
            (see `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_ for more details).
        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertModel(TFAlbertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the sentence-order prediction (classification)
            objective during Albert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertModel

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertModel.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """

    def __init__(self, config, **kwargs):
        super(TFAlbertModel, self).__init__(config, **kwargs)
        # Needed in `call` to build one head-mask placeholder per layer.
        self.num_hidden_layers = config.num_hidden_layers

        self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
        self.encoder = TFAlbertTransformer(config, name="encoder")
        # Dense + tanh applied to the hidden state of the first token.
        self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
            config.initializer_range), activation='tanh', name='pooler')

    def get_input_embeddings(self):
        """ Returns the embeddings layer of the model. """
        return self.embeddings

    def _resize_token_embeddings(self, new_num_tokens):
        """ Not implemented for this model. """
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel

            Not implemented for this model.
        """
        raise NotImplementedError

    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
        """ Runs the embeddings, the encoder and the pooler.

            ``inputs`` is either ``input_ids`` alone, a list/tuple ordered as
            ``[input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds]``
            (at most 6 items, trailing ones may be omitted) or a dict keyed by these names.
            Values found in the list/dict take precedence over the keyword arguments.

            Returns:
                tuple ``(sequence_output, pooled_output) + encoder_outputs[1:]``.

            Raises:
                ValueError: if both or neither of ``input_ids`` and ``inputs_embeds`` are given.
                NotImplementedError: if a ``head_mask`` is given.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            # Drop the trailing embedding dimension.
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Defaults: attend to every position, all tokens in segment 0.
        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        # We create a 4D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Head masking is not supported yet: a user-provided head_mask is rejected,
        # otherwise one `None` placeholder per hidden layer is handed to the encoder.
        if head_mask is not None:
            raise NotImplementedError
        head_mask = [None] * self.num_hidden_layers

        embedding_output = self.embeddings(
            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
        encoder_outputs = self.encoder(
            [embedding_output, extended_attention_mask, head_mask], training=training)

        sequence_output = encoder_outputs[0]
        # Pool from the hidden state of the first token of each sequence.
        pooled_output = self.pooler(sequence_output[:, 0])

        # add hidden_states and attentions if they are here
        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
        # sequence_output, pooled_output, (hidden_states), (attentions)
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""Albert Model with a `language modeling` head on top. """,
                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForMaskedLM

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

    """

    def __init__(self, config, *inputs, **kwargs):
        super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)

        # Bare ALBERT model; its embeddings layer is handed to the MLM head.
        base_model = TFAlbertModel(config, name='albert')
        self.albert = base_model
        self.predictions = TFAlbertMLMHead(config, base_model.embeddings, name='predictions')

    def get_output_embeddings(self):
        """ Returns the embeddings layer of the underlying ALBERT model. """
        return self.albert.embeddings

    def call(self, inputs, **kwargs):
        """ Returns ``(prediction_scores, (hidden_states), (attentions))``. """
        albert_outputs = self.albert(inputs, **kwargs)
        is_training = kwargs.get('training', False)

        # The first output of the base model is the sequence of hidden states.
        prediction_scores = self.predictions(albert_outputs[0], training=is_training)

        # Skip the pooled output (index 1); keep hidden states and attentions if present.
        return (prediction_scores,) + albert_outputs[2:]
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForSequenceClassification

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

    """

    def __init__(self, config, *inputs, **kwargs):
        super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Sub-layers are created in the order: base model, dropout, classifier.
        self.albert = TFAlbertModel(config, name='albert')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        classifier_init = get_initializer(config.initializer_range)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=classifier_init, name='classifier')

    def call(self, inputs, **kwargs):
        """ Returns ``(logits, (hidden_states), (attentions))``. """
        albert_outputs = self.albert(inputs, **kwargs)
        is_training = kwargs.get('training', False)

        # The classifier works on the pooled output (index 1 of the base model outputs).
        pooled = self.dropout(albert_outputs[1], training=is_training)
        logits = self.classifier(pooled)

        # Append hidden states and attentions when the base model returned them.
        return (logits,) + albert_outputs[2:]
|
||||||
@@ -109,6 +109,9 @@ class TFAutoModel(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -237,6 +240,9 @@ class TFAutoModelWithLMHead(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -360,6 +366,9 @@ class TFAutoModelForSequenceClassification(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -472,6 +481,9 @@ class TFAutoModelForQuestionAnswering(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import numpy as np
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
|
||||||
from .configuration_bert import BertConfig
|
from .configuration_bert import BertConfig
|
||||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
|
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||||
from .file_utils import add_start_docstrings
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -145,9 +145,9 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
|
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
|
||||||
|
|
||||||
if input_ids is not None:
|
if input_ids is not None:
|
||||||
input_shape = tf.shape(input_ids)
|
input_shape = shape_list(input_ids)
|
||||||
else:
|
else:
|
||||||
input_shape = tf.shape(inputs_embeds)[:-1]
|
input_shape = shape_list(inputs_embeds)[:-1]
|
||||||
|
|
||||||
seq_length = input_shape[1]
|
seq_length = input_shape[1]
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
@@ -172,8 +172,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
Returns:
|
Returns:
|
||||||
float32 tensor with shape [batch_size, length, vocab_size].
|
float32 tensor with shape [batch_size, length, vocab_size].
|
||||||
"""
|
"""
|
||||||
batch_size = tf.shape(inputs)[0]
|
batch_size = shape_list(inputs)[0]
|
||||||
length = tf.shape(inputs)[1]
|
length = shape_list(inputs)[1]
|
||||||
|
|
||||||
x = tf.reshape(inputs, [-1, self.hidden_size])
|
x = tf.reshape(inputs, [-1, self.hidden_size])
|
||||||
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
||||||
@@ -214,7 +214,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
|||||||
def call(self, inputs, training=False):
|
def call(self, inputs, training=False):
|
||||||
hidden_states, attention_mask, head_mask = inputs
|
hidden_states, attention_mask, head_mask = inputs
|
||||||
|
|
||||||
batch_size = tf.shape(hidden_states)[0]
|
batch_size = shape_list(hidden_states)[0]
|
||||||
mixed_query_layer = self.query(hidden_states)
|
mixed_query_layer = self.query(hidden_states)
|
||||||
mixed_key_layer = self.key(hidden_states)
|
mixed_key_layer = self.key(hidden_states)
|
||||||
mixed_value_layer = self.value(hidden_states)
|
mixed_value_layer = self.value(hidden_states)
|
||||||
@@ -225,7 +225,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||||
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k)
|
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k)
|
||||||
dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) # scale attention_scores
|
dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores
|
||||||
attention_scores = attention_scores / tf.math.sqrt(dk)
|
attention_scores = attention_scores / tf.math.sqrt(dk)
|
||||||
|
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
@@ -502,9 +502,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
|
|||||||
if input_ids is not None and inputs_embeds is not None:
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
elif input_ids is not None:
|
elif input_ids is not None:
|
||||||
input_shape = input_ids.shape
|
input_shape = shape_list(input_ids)
|
||||||
elif inputs_embeds is not None:
|
elif inputs_embeds is not None:
|
||||||
input_shape = inputs_embeds.shape[:-1]
|
input_shape = shape_list(inputs_embeds)[:-1]
|
||||||
else:
|
else:
|
||||||
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
@@ -939,11 +939,11 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
|
|||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
if input_ids is not None:
|
if input_ids is not None:
|
||||||
num_choices = tf.shape(input_ids)[1]
|
num_choices = shape_list(input_ids)[1]
|
||||||
seq_length = tf.shape(input_ids)[2]
|
seq_length = shape_list(input_ids)[2]
|
||||||
else:
|
else:
|
||||||
num_choices = tf.shape(inputs_embeds)[1]
|
num_choices = shape_list(inputs_embeds)[1]
|
||||||
seq_length = tf.shape(inputs_embeds)[2]
|
seq_length = shape_list(inputs_embeds)[2]
|
||||||
|
|
||||||
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
|
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
|
||||||
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
|
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
def call(self, inputs, training=False):
|
def call(self, inputs, training=False):
|
||||||
v, k, q, mask, layer_past, attention_mask, head_mask = inputs
|
v, k, q, mask, layer_past, attention_mask, head_mask = inputs
|
||||||
batch_size = q.shape[0]
|
batch_size = shape_list(q)[0]
|
||||||
|
|
||||||
q = self.Wq(q)
|
q = self.Wq(q)
|
||||||
k = self.Wk(k)
|
k = self.Wk(k)
|
||||||
@@ -400,7 +400,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
|
|||||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
Sequence of hidden-states at the last layer of the model.
|
Sequence of hidden-states at the last layer of the model.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
@@ -462,7 +462,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
|
|||||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
|||||||
@@ -37,7 +37,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
|
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
|
||||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5"
|
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
|
||||||
|
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -137,9 +138,9 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
|||||||
input_ids, position_ids = inputs
|
input_ids, position_ids = inputs
|
||||||
|
|
||||||
if input_ids is not None:
|
if input_ids is not None:
|
||||||
seq_length = tf.shape(input_ids)[1]
|
seq_length = shape_list(input_ids)[1]
|
||||||
else:
|
else:
|
||||||
seq_length = tf.shape(inputs_embeds)[1]
|
seq_length = shape_list(inputs_embeds)[1]
|
||||||
|
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
||||||
@@ -160,8 +161,8 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
|||||||
Returns:
|
Returns:
|
||||||
float32 tensor with shape [batch_size, length, vocab_size].
|
float32 tensor with shape [batch_size, length, vocab_size].
|
||||||
"""
|
"""
|
||||||
batch_size = tf.shape(inputs)[0]
|
batch_size = shape_list(inputs)[0]
|
||||||
length = tf.shape(inputs)[1]
|
length = shape_list(inputs)[1]
|
||||||
|
|
||||||
x = tf.reshape(inputs, [-1, self.dim])
|
x = tf.reshape(inputs, [-1, self.dim])
|
||||||
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
||||||
@@ -703,6 +704,53 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
|
|||||||
return outputs # logits, (hidden_states), (attentions)
|
return outputs # logits, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
|
||||||
|
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
|
||||||
|
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
||||||
|
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
|
||||||
|
r"""
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
|
||||||
|
Classification scores (before SoftMax).
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
Examples::
|
||||||
|
import tensorflow as tf
|
||||||
|
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
|
||||||
|
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
|
||||||
|
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
|
||||||
|
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||||
|
outputs = model(input_ids)
|
||||||
|
scores = outputs[0]
|
||||||
|
"""
|
||||||
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
|
super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
||||||
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
|
self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
|
||||||
|
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||||
|
self.classifier = tf.keras.layers.Dense(config.num_labels,
|
||||||
|
kernel_initializer=get_initializer(config.initializer_range),
|
||||||
|
name='classifier')
|
||||||
|
|
||||||
|
def call(self, inputs, **kwargs):
|
||||||
|
outputs = self.distilbert(inputs, **kwargs)
|
||||||
|
|
||||||
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
|
sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
|
||||||
|
logits = self.classifier(sequence_output)
|
||||||
|
|
||||||
|
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
|
||||||
|
|
||||||
|
return outputs # scores, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
||||||
the hidden-states output to compute `span start logits` and `span end logits`). """,
|
the hidden-states output to compute `span start logits` and `span end logits`). """,
|
||||||
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ class TFAttention(tf.keras.layers.Layer):
|
|||||||
# q, k, v have shape [batch, heads, sequence, features]
|
# q, k, v have shape [batch, heads, sequence, features]
|
||||||
w = tf.matmul(q, k, transpose_b=True)
|
w = tf.matmul(q, k, transpose_b=True)
|
||||||
if self.scale:
|
if self.scale:
|
||||||
dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
|
dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores
|
||||||
w = w / tf.math.sqrt(dk)
|
w = w / tf.math.sqrt(dk)
|
||||||
|
|
||||||
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
|
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
|
||||||
@@ -436,7 +436,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
|
|||||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
Sequence of hidden-states at the last layer of the model.
|
Sequence of hidden-states at the last layer of the model.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
@@ -476,7 +476,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
|
|||||||
**prediction_scores**: `tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: `tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
@@ -535,7 +535,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
|||||||
**mc_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices)``
|
**mc_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices)``
|
||||||
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ class TFAttention(tf.keras.layers.Layer):
|
|||||||
# q, k, v have shape [batch, heads, sequence, features]
|
# q, k, v have shape [batch, heads, sequence, features]
|
||||||
w = tf.matmul(q, k, transpose_b=True)
|
w = tf.matmul(q, k, transpose_b=True)
|
||||||
if self.scale:
|
if self.scale:
|
||||||
dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
|
dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores
|
||||||
w = w / tf.math.sqrt(dk)
|
w = w / tf.math.sqrt(dk)
|
||||||
|
|
||||||
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
|
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
|
||||||
|
|||||||
@@ -118,6 +118,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
|
|||||||
new_key = key.replace('gamma', 'weight')
|
new_key = key.replace('gamma', 'weight')
|
||||||
if 'beta' in key:
|
if 'beta' in key:
|
||||||
new_key = key.replace('beta', 'bias')
|
new_key = key.replace('beta', 'bias')
|
||||||
|
# DialoGPT format
|
||||||
|
if key == 'lm_head.decoder.weight':
|
||||||
|
new_key = 'lm_head.weight'
|
||||||
if new_key:
|
if new_key:
|
||||||
old_keys.append(key)
|
old_keys.append(key)
|
||||||
new_keys.append(new_key)
|
new_keys.append(new_key)
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ import numpy as np
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
|
||||||
from .configuration_roberta import RobertaConfig
|
from .configuration_roberta import RobertaConfig
|
||||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
|
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||||
from .file_utils import add_start_docstrings
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
|
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
|
||||||
@@ -51,9 +51,9 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
|
|||||||
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
|
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
|
||||||
|
|
||||||
if input_ids is not None:
|
if input_ids is not None:
|
||||||
seq_length = tf.shape(input_ids)[1]
|
seq_length = shape_list(input_ids)[1]
|
||||||
else:
|
else:
|
||||||
seq_length = tf.shape(inputs_embeds)[1]
|
seq_length = shape_list(inputs_embeds)[1]
|
||||||
|
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :]
|
position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :]
|
||||||
|
|||||||
@@ -337,7 +337,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
|
|||||||
emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i])
|
emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i])
|
||||||
|
|
||||||
mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
|
mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
|
||||||
emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(tf.shape(emb_flat), dtype=tf.int64))
|
emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64))
|
||||||
|
|
||||||
embed_shape = shape_list(inp) + [self.d_proj]
|
embed_shape = shape_list(inp) + [self.d_proj]
|
||||||
embed = tf.reshape(emb_flat, embed_shape)
|
embed = tf.reshape(emb_flat, embed_shape)
|
||||||
|
|||||||
@@ -105,7 +105,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _gather_logprob(logprob, target):
|
def _gather_logprob(logprob, target):
|
||||||
lp_size = tf.shape(logprob)
|
lp_size = shape_list(logprob)
|
||||||
r = tf.range(lp_size[0])
|
r = tf.range(lp_size[0])
|
||||||
idx = tf.stack([r, target], 1)
|
idx = tf.stack([r, target], 1)
|
||||||
return tf.gather_nd(logprob, idx)
|
return tf.gather_nd(logprob, idx)
|
||||||
@@ -159,7 +159,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
|||||||
cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
|
cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
|
||||||
cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
|
cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
|
||||||
if target is not None:
|
if target is not None:
|
||||||
loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(tf.shape(loss), dtype=tf.int64))
|
loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64))
|
||||||
out = tf.concat(out, axis=-1)
|
out = tf.concat(out, axis=-1)
|
||||||
|
|
||||||
if target is not None:
|
if target is not None:
|
||||||
|
|||||||
@@ -51,7 +51,15 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
config_class = None
|
config_class = None
|
||||||
pretrained_model_archive_map = {}
|
pretrained_model_archive_map = {}
|
||||||
base_model_prefix = ""
|
base_model_prefix = ""
|
||||||
dummy_inputs = tf.constant(DUMMY_INPUTS) # dummy inputs to build the network
|
|
||||||
|
@property
|
||||||
|
def dummy_inputs(self):
|
||||||
|
""" Dummy inputs to build the network.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tf.Tensor with dummy inputs
|
||||||
|
"""
|
||||||
|
return tf.constant(DUMMY_INPUTS)
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
|
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||||
@@ -191,6 +199,9 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -216,6 +227,7 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
cache_dir = kwargs.pop('cache_dir', None)
|
cache_dir = kwargs.pop('cache_dir', None)
|
||||||
from_pt = kwargs.pop('from_pt', False)
|
from_pt = kwargs.pop('from_pt', False)
|
||||||
force_download = kwargs.pop('force_download', False)
|
force_download = kwargs.pop('force_download', False)
|
||||||
|
resume_download = kwargs.pop('resume_download', False)
|
||||||
proxies = kwargs.pop('proxies', None)
|
proxies = kwargs.pop('proxies', None)
|
||||||
|
|
||||||
# Load config
|
# Load config
|
||||||
@@ -224,6 +236,7 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
pretrained_model_name_or_path, *model_args,
|
pretrained_model_name_or_path, *model_args,
|
||||||
cache_dir=cache_dir, return_unused_kwargs=True,
|
cache_dir=cache_dir, return_unused_kwargs=True,
|
||||||
force_download=force_download,
|
force_download=force_download,
|
||||||
|
resume_download=resume_download,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -251,7 +264,8 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
|
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
|
||||||
|
resume_download=resume_download, proxies=proxies)
|
||||||
except EnvironmentError as e:
|
except EnvironmentError as e:
|
||||||
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
|
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
|
||||||
logger.error(
|
logger.error(
|
||||||
@@ -454,7 +468,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
|
|||||||
elif self.summary_type == 'first':
|
elif self.summary_type == 'first':
|
||||||
output = hidden_states[:, 0]
|
output = hidden_states[:, 0]
|
||||||
elif self.summary_type == 'mean':
|
elif self.summary_type == 'mean':
|
||||||
output = tf.mean(hidden_states, axis=1)
|
output = tf.reduce_mean(hidden_states, axis=1)
|
||||||
elif self.summary_type == 'cls_index':
|
elif self.summary_type == 'cls_index':
|
||||||
hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims]
|
hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims]
|
||||||
if cls_index is None:
|
if cls_index is None:
|
||||||
|
|||||||
@@ -112,8 +112,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
|||||||
def prune_heads(self, heads):
|
def prune_heads(self, heads):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@staticmethod
|
def rel_shift(self, x, klen=-1):
|
||||||
def rel_shift(x, klen=-1):
|
|
||||||
"""perform relative shift to form the relative attention score."""
|
"""perform relative shift to form the relative attention score."""
|
||||||
x_size = shape_list(x)
|
x_size = shape_list(x)
|
||||||
|
|
||||||
@@ -135,7 +134,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
# position based attention score
|
# position based attention score
|
||||||
bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
|
bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
|
||||||
bd = self.rel_shift(bd, klen=ac.shape[1])
|
bd = self.rel_shift(bd, klen=shape_list(ac)[1])
|
||||||
|
|
||||||
# segment based attention score
|
# segment based attention score
|
||||||
if seg_mat is None:
|
if seg_mat is None:
|
||||||
@@ -192,7 +191,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
|||||||
if g is not None:
|
if g is not None:
|
||||||
###### Two-stream attention with relative positional encoding.
|
###### Two-stream attention with relative positional encoding.
|
||||||
# content based attention score
|
# content based attention score
|
||||||
if mems is not None and mems.shape.ndims > 1:
|
if mems is not None and len(shape_list(mems)) > 1:
|
||||||
cat = tf.concat([mems, h], axis=0)
|
cat = tf.concat([mems, h], axis=0)
|
||||||
else:
|
else:
|
||||||
cat = h
|
cat = h
|
||||||
@@ -252,7 +251,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
###### Multi-head attention with relative positional encoding
|
###### Multi-head attention with relative positional encoding
|
||||||
if mems is not None and mems.shape.ndims > 1:
|
if mems is not None and len(shape_list(mems)) > 1:
|
||||||
cat = tf.concat([mems, h], axis=0)
|
cat = tf.concat([mems, h], axis=0)
|
||||||
else:
|
else:
|
||||||
cat = h
|
cat = h
|
||||||
@@ -565,7 +564,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
if data_mask is not None:
|
if data_mask is not None:
|
||||||
# all mems can be attended to
|
# all mems can be attended to
|
||||||
mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
|
mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz],
|
||||||
dtype=dtype_float)
|
dtype=dtype_float)
|
||||||
data_mask = tf.concat([mems_mask, data_mask], axis=1)
|
data_mask = tf.concat([mems_mask, data_mask], axis=1)
|
||||||
if attn_mask is None:
|
if attn_mask is None:
|
||||||
@@ -590,7 +589,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
|||||||
word_emb_k = self.word_embedding(input_ids)
|
word_emb_k = self.word_embedding(input_ids)
|
||||||
output_h = self.dropout(word_emb_k, training=training)
|
output_h = self.dropout(word_emb_k, training=training)
|
||||||
if target_mapping is not None:
|
if target_mapping is not None:
|
||||||
word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
|
word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1])
|
||||||
# else: # We removed the inp_q input which was same as target mapping
|
# else: # We removed the inp_q input which was same as target mapping
|
||||||
# inp_q_ext = inp_q[:, :, None]
|
# inp_q_ext = inp_q[:, :, None]
|
||||||
# word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
|
# word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
|
||||||
@@ -939,6 +938,59 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
|
|||||||
return outputs # return logits, (mems), (hidden states), (attentions)
|
return outputs # return logits, (mems), (hidden states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
|
||||||
|
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
|
||||||
|
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
||||||
|
class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
|
||||||
|
r"""
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
|
||||||
|
Classification scores (before SoftMax).
|
||||||
|
**mems**: (`optional`, returned when ``config.mem_len > 0``)
|
||||||
|
list of ``tf.Tensor`` (one for each layer):
|
||||||
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
|
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
|
||||||
|
See details in the docstring of the `mems` input above.
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
import tensorflow as tf
|
||||||
|
from transformers import XLNetTokenizer, TFXLNetForTokenClassification
|
||||||
|
|
||||||
|
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||||
|
model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
|
||||||
|
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||||
|
outputs = model(input_ids)
|
||||||
|
scores = outputs[0]
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
|
super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
||||||
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
|
self.transformer = TFXLNetMainLayer(config, name='transformer')
|
||||||
|
self.classifier = tf.keras.layers.Dense(config.num_labels,
|
||||||
|
kernel_initializer=get_initializer(config.initializer_range),
|
||||||
|
name='classifier')
|
||||||
|
|
||||||
|
def call(self, inputs, **kwargs):
|
||||||
|
transformer_outputs = self.transformer(inputs, **kwargs)
|
||||||
|
output = transformer_outputs[0]
|
||||||
|
|
||||||
|
logits = self.classifier(output)
|
||||||
|
|
||||||
|
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it
|
||||||
|
|
||||||
|
return outputs # return logits, (mems), (hidden states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
||||||
# the hidden-states output to compute `span start logits` and `span end logits`). """,
|
# the hidden-states output to compute `span start logits` and `span end logits`). """,
|
||||||
# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
||||||
|
|||||||
@@ -291,6 +291,9 @@ class PreTrainedModel(nn.Module):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -315,11 +318,16 @@ class PreTrainedModel(nn.Module):
|
|||||||
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
|
||||||
|
logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
|
||||||
|
"https://github.com/google-research/google-research/issues/119 for more information.")
|
||||||
|
|
||||||
config = kwargs.pop('config', None)
|
config = kwargs.pop('config', None)
|
||||||
state_dict = kwargs.pop('state_dict', None)
|
state_dict = kwargs.pop('state_dict', None)
|
||||||
cache_dir = kwargs.pop('cache_dir', None)
|
cache_dir = kwargs.pop('cache_dir', None)
|
||||||
from_tf = kwargs.pop('from_tf', False)
|
from_tf = kwargs.pop('from_tf', False)
|
||||||
force_download = kwargs.pop('force_download', False)
|
force_download = kwargs.pop('force_download', False)
|
||||||
|
resume_download = kwargs.pop('resume_download', False)
|
||||||
proxies = kwargs.pop('proxies', None)
|
proxies = kwargs.pop('proxies', None)
|
||||||
output_loading_info = kwargs.pop('output_loading_info', False)
|
output_loading_info = kwargs.pop('output_loading_info', False)
|
||||||
|
|
||||||
@@ -329,6 +337,7 @@ class PreTrainedModel(nn.Module):
|
|||||||
pretrained_model_name_or_path, *model_args,
|
pretrained_model_name_or_path, *model_args,
|
||||||
cache_dir=cache_dir, return_unused_kwargs=True,
|
cache_dir=cache_dir, return_unused_kwargs=True,
|
||||||
force_download=force_download,
|
force_download=force_download,
|
||||||
|
resume_download=resume_download,
|
||||||
proxies=proxies,
|
proxies=proxies,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
@@ -361,7 +370,8 @@ class PreTrainedModel(nn.Module):
|
|||||||
|
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
|
||||||
|
proxies=proxies, resume_download=resume_download)
|
||||||
except EnvironmentError:
|
except EnvironmentError:
|
||||||
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
|
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
|
||||||
msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
|
msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
|
||||||
@@ -417,6 +427,8 @@ class PreTrainedModel(nn.Module):
|
|||||||
new_key = key.replace('gamma', 'weight')
|
new_key = key.replace('gamma', 'weight')
|
||||||
if 'beta' in key:
|
if 'beta' in key:
|
||||||
new_key = key.replace('beta', 'bias')
|
new_key = key.replace('beta', 'bias')
|
||||||
|
if key == 'lm_head.decoder.weight':
|
||||||
|
new_key = 'lm_head.weight'
|
||||||
if new_key:
|
if new_key:
|
||||||
old_keys.append(key)
|
old_keys.append(key)
|
||||||
new_keys.append(new_key)
|
new_keys.append(new_key)
|
||||||
@@ -728,7 +740,7 @@ class SequenceSummary(nn.Module):
|
|||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(SequenceSummary, self).__init__()
|
super(SequenceSummary, self).__init__()
|
||||||
|
|
||||||
self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
|
self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
|
||||||
if self.summary_type == 'attn':
|
if self.summary_type == 'attn':
|
||||||
# We should use a standard multi-head attention module with absolute positional embedding for that.
|
# We should use a standard multi-head attention module with absolute positional embedding for that.
|
||||||
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
|
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
|
||||||
|
|||||||
@@ -583,6 +583,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -878,7 +879,11 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
|
hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
|
||||||
outputs = outputs + (hidden_states,)
|
outputs = outputs + (hidden_states,)
|
||||||
if self.output_attentions:
|
if self.output_attentions:
|
||||||
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
|
if target_mapping is not None:
|
||||||
|
# when target_mapping is provided, there are 2-tuple of attentions
|
||||||
|
attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions)
|
||||||
|
else:
|
||||||
|
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
|
||||||
outputs = outputs + (attentions,)
|
outputs = outputs + (attentions,)
|
||||||
|
|
||||||
return outputs # outputs, (new_mems), (hidden_states), (attentions)
|
return outputs # outputs, (new_mems), (hidden_states), (attentions)
|
||||||
@@ -913,6 +918,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -995,6 +1001,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
|||||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -1046,6 +1053,106 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
|||||||
|
|
||||||
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
|
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
|
||||||
|
|
||||||
|
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
                      XLNET_START_DOCSTRING,
                      XLNET_INPUTS_DOCSTRING)
class XLNetForTokenClassification(XLNetPreTrainedModel):
    r"""
    Inputs:
        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
            When ``labels`` is provided, only positions where the mask equals ``1`` contribute to the loss.
        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification loss.
        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
            Classification scores (before SoftMax).
        **mems**: (`optional`, returned when ``config.mem_len > 0``)
            list of ``torch.FloatTensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
            See details in the docstring of the `mems` input above.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

    Examples::

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        scores = outputs[0]

    """
    def __init__(self, config):
        super(XLNetForTokenClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLNetModel(config)
        # One linear projection applied independently to every token's hidden state.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.transformer(input_ids,
                                   attention_mask=attention_mask,
                                   mems=mems,
                                   perm_mask=perm_mask,
                                   target_mapping=target_mapping,
                                   token_type_ids=token_type_ids,
                                   input_mask=input_mask,
                                   head_mask=head_mask,
                                   inputs_embeds=inputs_embeds)

        sequence_output = outputs[0]

        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[1:]  # Keep mems, hidden states, attentions if there are in it
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                # Flatten (batch, seq) so masked-out positions can be dropped before the loss.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
|
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
|
||||||
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
|
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
|
||||||
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
||||||
@@ -1095,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
|
|||||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -1180,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
|
|||||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -1294,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
|||||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
|
|||||||
254
transformers/optimization_tf.py
Normal file
254
transformers/optimization_tf.py
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ==============================================================================
|
||||||
|
"""Functions and classes related to optimization (weight updates)."""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
|
||||||
|
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule that prepends a polynomial warmup to another schedule.

    While ``step < warmup_steps`` the rate is
    ``initial_learning_rate * (step / warmup_steps) ** power``; from then on the
    wrapped ``decay_schedule_fn`` is evaluated at ``step``.
    """

    def __init__(
            self,
            initial_learning_rate,
            decay_schedule_fn,
            warmup_steps,
            power=1.0,
            name=None):
        super(WarmUp, self).__init__()
        self.name = name
        self.power = power
        self.warmup_steps = warmup_steps
        self.decay_schedule_fn = decay_schedule_fn
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        """Return the learning rate to use at ``step``."""
        with tf.name_scope(self.name or 'WarmUp') as scope_name:
            # Work in float32 so the progress ratio is a real-valued fraction.
            step_f = tf.cast(step, tf.float32)
            warmup_f = tf.cast(self.warmup_steps, tf.float32)
            progress = step_f / warmup_f
            ramped_lr = self.initial_learning_rate * tf.math.pow(progress, self.power)
            return tf.cond(
                pred=step_f < warmup_f,
                true_fn=lambda: ramped_lr,
                false_fn=lambda: self.decay_schedule_fn(step),
                name=scope_name)

    def get_config(self):
        """Return the constructor arguments as a dict."""
        config = {
            'initial_learning_rate': self.initial_learning_rate,
            'decay_schedule_fn': self.decay_schedule_fn,
            'warmup_steps': self.warmup_steps,
            'power': self.power,
            'name': self.name
        }
        return config
|
||||||
|
|
||||||
|
|
||||||
|
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
    """Build an ``AdamWeightDecay`` optimizer driven by a decaying learning-rate schedule.

    Args:
        init_lr: Peak learning rate, used as the start of the decay schedule.
        num_train_steps: Number of steps over which the rate decays to ``0.0``.
        num_warmup_steps: Number of warmup steps; when falsy no warmup is applied.

    Returns:
        The configured ``AdamWeightDecay`` instance.
    """
    # Linear decay of the learning rate down to zero.
    schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)

    if num_warmup_steps:
        # Wrap the decay schedule so the rate ramps up first.
        schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=schedule,
            warmup_steps=num_warmup_steps)

    return AdamWeightDecay(
        learning_rate=schedule,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['layer_norm', 'bias'])
|
||||||
|
|
||||||
|
|
||||||
|
class AdamWeightDecay(tf.keras.optimizers.Adam):
    """Adam enables L2 weight decay and clip_by_global_norm on gradients.

    Just adding the square of the weights to the loss function is *not* the
    correct way of using L2 regularization/weight decay with Adam, since that will
    interact with the m and v parameters in strange ways.

    Instead we want to decay the weights in a manner that doesn't interact with
    the m/v parameters. This is equivalent to adding the square of the weights to
    the loss with plain (non-momentum) SGD.

    Args:
        learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs:
            Forwarded unchanged to ``tf.keras.optimizers.Adam``.
        weight_decay_rate: Decay coefficient; ``0`` disables weight decay.
        include_in_weight_decay: Optional list of regex patterns; a variable
            whose name matches any of them is always decayed.
        exclude_from_weight_decay: Optional list of regex patterns; a variable
            whose name matches any of them (and no include pattern) is not decayed.
    """

    def __init__(self,
                 learning_rate=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-7,
                 amsgrad=False,
                 weight_decay_rate=0.0,
                 include_in_weight_decay=None,
                 exclude_from_weight_decay=None,
                 name='AdamWeightDecay',
                 **kwargs):
        super(AdamWeightDecay, self).__init__(
            learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {'WarmUp': WarmUp}
        return super(AdamWeightDecay, cls).from_config(
            config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Extend the parent's per-step state with the weight decay rate constant."""
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
                                                    apply_state)
        apply_state['weight_decay_rate'] = tf.constant(
            self.weight_decay_rate, name='adam_weight_decay_rate')

    def _decay_weights_op(self, var, learning_rate, apply_state):
        """Return the op that shrinks ``var`` by ``learning_rate * var * rate``.

        Returns ``tf.no_op()`` when ``var`` is excluded from weight decay.
        """
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            # The apply methods default ``apply_state`` to None and ``_get_lr``
            # tolerates that, so fall back to the raw attribute instead of
            # failing on ``None['weight_decay_rate']``.
            if apply_state is not None and 'weight_decay_rate' in apply_state:
                decay_rate = apply_state['weight_decay_rate']
            else:
                decay_rate = self.weight_decay_rate
            return var.assign_sub(
                learning_rate * var *
                decay_rate,
                use_locking=self._use_locking)
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, clip_norm, name=None):
        """Clip gradients by global norm, then apply them.

        Args:
            grads_and_vars: Iterable of ``(gradient, variable)`` pairs.
            clip_norm: Global-norm threshold passed to ``tf.clip_by_global_norm``.
            name: Optional name forwarded to the parent ``apply_gradients``.
        """
        grads, tvars = list(zip(*grads_and_vars))
        (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
        # ``name`` used to be silently dropped; forward it to the parent.
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state.

        Returns a ``(lr_t, kwargs)`` pair where ``kwargs`` is what should be
        forwarded to the parent ``_resource_apply_*`` call.
        """
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients['lr_t'], dict(apply_state=apply_state)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Decay ``var`` first, then run the parent's dense Adam update."""
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(
                grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        """Decay ``var`` first, then run the parent's sparse Adam update."""
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(
                grad, var, indices, **kwargs)

    def get_config(self):
        """Return the parent config plus ``weight_decay_rate``.

        Only ``weight_decay_rate`` is added here; the include/exclude pattern
        lists are not part of the returned config.
        """
        config = super(AdamWeightDecay, self).get_config()
        config.update({
            'weight_decay_rate': self.weight_decay_rate,
        })
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        # Include patterns win over exclude patterns.
        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True
|
||||||
|
|
||||||
|
|
||||||
|
## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
|
||||||
|
class GradientAccumulator(object):
    """Accumulates gradients across several calls, aware of ``tf.distribute`` strategies."""

    def __init__(self):
        """Create an empty accumulator with a zeroed step counter."""
        self._gradients = []
        self._accum_steps = tf.Variable(
            initial_value=0,
            dtype=tf.int64,
            trainable=False,
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)

    @property
    def step(self):
        """Current value of the accumulated-steps counter."""
        return self._accum_steps.value()

    @property
    def gradients(self):
        """Values of the accumulated gradients (``None`` entries are kept as ``None``)."""
        values = []
        for slot in self._get_replica_gradients():
            values.append(slot if slot is None else slot.value())
        return values

    def __call__(self, gradients):
        """Add :obj:`gradients` to the running totals and bump the step counter."""
        if not self._gradients:
            # First call: allocate one zero-filled, non-trainable variable per gradient.
            fresh_slots = [
                tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient
                for gradient in gradients
            ]
            self._gradients.extend(fresh_slots)

        expected, received = len(self._gradients), len(gradients)
        if received != expected:
            raise ValueError("Expected %s gradients, but got %d" % (expected, received))

        for slot, incoming in zip(self._get_replica_gradients(), gradients):
            if slot is None:
                continue
            slot.assign_add(incoming)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Zero the step counter and every accumulated gradient."""
        if self._gradients:
            self._accum_steps.assign(0)

        for slot in self._get_replica_gradients():
            if slot is None:
                continue
            slot.assign(tf.zeros_like(slot))

    def _get_replica_gradients(self):
        """Return the gradient slots to use in the current distribution context."""
        if not tf.distribute.has_strategy():
            return self._gradients

        # In a replica context, we want to accumulate gradients on each replica
        # without synchronization, so we directly assign the value of the
        # current replica.
        replica_context = tf.distribute.get_replica_context()

        if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
            return self._gradients

        return (
            gradient.device_map.select_for_current_replica(gradient.values, replica_context)
            for gradient in self._gradients
        )
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
# content of conftest.py
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_addoption(parser):
    """Register the ``--runslow`` and ``--use_cuda`` boolean command-line flags."""
    flags = (
        ("--runslow", "run slow tests"),
        ("--use_cuda", "run tests on gpu"),
    )
    for flag, description in flags:
        parser.addoption(flag, action="store_true", default=False, help=description)
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_configure(config):
    """Add the ``slow`` marker description to the ``markers`` ini setting."""
    slow_marker = "slow: mark test as slow to run"
    config.addinivalue_line("markers", slow_marker)
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_collection_modifyitems(config, items):
    """Attach a skip marker to every ``slow`` item unless ``--runslow`` was given."""
    run_slow = config.getoption("--runslow")
    if run_slow:
        # --runslow given in cli: do not skip slow tests
        return

    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
    for collected in items:
        if "slow" not in collected.keywords:
            continue
        collected.add_marker(skip_slow)
|
|
||||||
|
|
||||||
@pytest.fixture
def use_cuda(request):
    """ Fixture returning the value of the ``--use_cuda`` command-line option """
    cli_config = request.config
    return cli_config.getoption("--use_cuda")
|
|
||||||
BIN
transformers/tests/fixtures/spiece.model
vendored
Normal file
BIN
transformers/tests/fixtures/spiece.model
vendored
Normal file
Binary file not shown.
102
transformers/tests/hf_api_test.py
Normal file
102
transformers/tests/hf_api_test.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
import six
|
||||||
|
import time
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
|
||||||
|
|
||||||
|
# Dummy credentials passed to the API's ``login`` call by the tests below.
USER = "__DUMMY_TRANSFORMERS_USER__"
PASS = "__DUMMY_TRANSFORMERS_PASS__"
# Object key embedding the current epoch second (presumably to keep runs from colliding).
FILE_KEY = "Test-{}.txt".format(int(time.time()))
# Fixture file located relative to this test module's directory.
FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class HfApiCommonTest(unittest.TestCase):
    """Base test case holding one ``HfApi`` client shared by its subclasses."""

    # Class-level client, created once at import time.
    _api = HfApi(
        endpoint="https://moon-staging.huggingface.co",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class HfApiLoginTest(HfApiCommonTest):
    """Exercises ``login`` with a wrong and with the expected password."""

    def test_login_invalid(self):
        # A bad password must surface as an HTTPError.
        self.assertRaises(HTTPError, self._api.login, username=USER, password="fake")

    def test_login_valid(self):
        # A successful login hands back a string token.
        session_token = self._api.login(username=USER, password=PASS)
        self.assertIsInstance(session_token, six.string_types)
|
||||||
|
|
||||||
|
|
||||||
|
class HfApiEndpointsTest(HfApiCommonTest):
    """Checks the token-authenticated endpoints using one token for the whole class."""

    @classmethod
    def setUpClass(cls):
        """
        Log in once and keep the token for every test in this class.
        """
        cls._token = cls._api.login(username=USER, password=PASS)

    def test_whoami(self):
        self.assertEqual(self._api.whoami(token=self._token), USER)

    def test_presign(self):
        presigned = self._api.presign(token=self._token, filename=FILE_KEY)
        self.assertIsInstance(presigned, PresignedUrl)
        self.assertEqual(presigned.type, "text/plain")

    def test_presign_and_upload(self):
        upload_kwargs = dict(token=self._token, filename=FILE_KEY, filepath=FILE_PATH)
        access_url = self._api.presign_and_upload(**upload_kwargs)
        self.assertIsInstance(access_url, six.string_types)

    def test_list_objs(self):
        listing = self._api.list_objs(token=self._token)
        self.assertIsInstance(listing, list)
        if len(listing) == 0:
            return
        # Only the last entry is type-checked.
        self.assertIsInstance(listing[-1], S3Obj)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class HfFolderTest(unittest.TestCase):
    """Round-trip test for the token helpers on ``HfFolder``."""

    def test_token_workflow(self):
        """
        Save a token, read it back, delete it twice, then confirm that
        reading a non-existent token yields ``None``.
        """
        fresh_token = "token-{}".format(int(time.time()))
        HfFolder.save_token(fresh_token)
        self.assertEqual(HfFolder.get_token(), fresh_token)

        # Deleting twice is intentional: the second call must not fail.
        for _ in range(2):
            HfFolder.delete_token()

        self.assertEqual(HfFolder.get_token(), None)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
240
transformers/tests/modeling_albert_test.py
Normal file
240
transformers/tests/modeling_albert_test.py
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from transformers import is_torch_available
|
||||||
|
|
||||||
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
|
||||||
|
AlbertForSequenceClassification, AlbertForQuestionAnswering,
|
||||||
|
)
|
||||||
|
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
class AlbertModelTest(CommonTestCases.CommonModelTester):
    """ALBERT test case: inherits from CommonModelTester and adds output-shape checks per head."""

    all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()

    class AlbertModelTester(object):
        """Builds an AlbertConfig plus input tensors and checks model output shapes.

        Assertions are reported through ``parent``, the enclosing test case.
        """

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_token_type_ids=True,
                     use_labels=True,
                     vocab_size=99,
                     embedding_size=16,
                     hidden_size=36,
                     num_hidden_layers=6,
                     num_hidden_groups=6,
                     num_attention_heads=6,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            """Store the test case (``parent``) and every sizing/config option as an attribute."""
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            # NOTE(review): embedding_size is stored here but is not forwarded to
            # AlbertConfig in prepare_config_and_inputs -- confirm whether intended.
            self.embedding_size = embedding_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.num_hidden_groups = num_hidden_groups

        def prepare_config_and_inputs(self):
            """Create the config and the input/label tensors used by the checks.

            Returns:
                A tuple ``(config, input_ids, token_type_ids, input_mask,
                sequence_labels, token_labels, choice_labels)``. Optional
                entries are ``None`` when the matching ``use_*`` flag is off.
            """
            batch_by_seq = [self.batch_size, self.seq_length]

            input_ids = ids_tensor(batch_by_seq, self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor(batch_by_seq, vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor(batch_by_seq, self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor(batch_by_seq, self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                num_hidden_groups=self.num_hidden_groups)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            """Assert that ``result["loss"]`` has an empty size, i.e. is a scalar."""
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask,
                                          sequence_labels, token_labels, choice_labels):
            """Run AlbertModel with decreasing sets of optional inputs and check output sizes."""
            model = AlbertModel(config=config)
            model.to(torch_device)
            model.eval()
            # Each call must unpack into exactly two outputs; the last one is checked below.
            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask,
                                                   token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(
                list(result["pooled_output"].size()),
                [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                                  sequence_labels, token_labels, choice_labels):
            """Run AlbertForMaskedLM with labels and check the score size and the loss."""
            model = AlbertForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(input_ids, attention_mask=input_mask,
                                            token_type_ids=token_type_ids,
                                            masked_lm_labels=token_labels)
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.check_loss_output(result)

        def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask,
                                                           sequence_labels, token_labels, choice_labels):
            """Run AlbertForQuestionAnswering and check start/end logit sizes and the loss."""
            model = AlbertForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            # sequence_labels is passed as both the start and the end positions.
            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask,
                                                   token_type_ids=token_type_ids,
                                                   start_positions=sequence_labels,
                                                   end_positions=sequence_labels)
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(
                list(result["start_logits"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["end_logits"].size()),
                [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids,
                                                                input_mask, sequence_labels, token_labels,
                                                                choice_labels):
            """Run AlbertForSequenceClassification and check the logit size and the loss."""
            config.num_labels = self.num_labels
            model = AlbertForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(input_ids, attention_mask=input_mask,
                                 token_type_ids=token_type_ids, labels=sequence_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            """Return ``(config, inputs_dict)`` with the three keyword inputs and no labels."""
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
                'attention_mask': input_mask,
            }
            return config, inputs_dict

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = AlbertModelTest.AlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):
        """Run the common configuration tests for AlbertConfig."""
        self.config_tester.run_common_tests()

    def test_albert_model(self):
        """Check output sizes of the base AlbertModel."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        """Check outputs of AlbertForMaskedLM."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)

    def test_for_question_answering(self):
        """Check outputs of AlbertForQuestionAnswering."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)

    def test_for_sequence_classification(self):
        """Check outputs of AlbertForSequenceClassification."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        """Load the first archived ALBERT checkpoint and check that a model is returned."""
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            try:
                model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
            finally:
                # Always remove the cache, even when loading fails. ignore_errors
                # keeps a missing directory from masking the original exception.
                shutil.rmtree(cache_dir, ignore_errors=True)
            self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
# Run the tests of this module when the file is executed directly.
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -18,11 +18,12 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
|
from .utils import require_torch, slow
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import (AutoConfig, BertConfig,
|
from transformers import (AutoConfig, BertConfig,
|
||||||
AutoModel, BertModel,
|
AutoModel, BertModel,
|
||||||
@@ -33,12 +34,11 @@ if is_torch_available():
|
|||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class AutoModelTest(unittest.TestCase):
|
class AutoModelTest(unittest.TestCase):
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
@@ -53,7 +53,7 @@ class AutoModelTest(unittest.TestCase):
|
|||||||
for value in loading_info.values():
|
for value in loading_info.values():
|
||||||
self.assertEqual(len(value), 0)
|
self.assertEqual(len(value), 0)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_lmhead_model_from_pretrained(self):
|
def test_lmhead_model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
@@ -66,7 +66,7 @@ class AutoModelTest(unittest.TestCase):
|
|||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
self.assertIsInstance(model, BertForMaskedLM)
|
self.assertIsInstance(model, BertForMaskedLM)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_sequence_classification_model_from_pretrained(self):
|
def test_sequence_classification_model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
@@ -79,7 +79,7 @@ class AutoModelTest(unittest.TestCase):
|
|||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
self.assertIsInstance(model, BertForSequenceClassification)
|
self.assertIsInstance(model, BertForSequenceClassification)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_question_answering_model_from_pretrained(self):
|
def test_question_answering_model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -18,12 +18,12 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import (BertConfig, BertModel, BertForMaskedLM,
|
from transformers import (BertConfig, BertModel, BertForMaskedLM,
|
||||||
@@ -31,11 +31,9 @@ if is_torch_available():
|
|||||||
BertForQuestionAnswering, BertForSequenceClassification,
|
BertForQuestionAnswering, BertForSequenceClassification,
|
||||||
BertForTokenClassification, BertForMultipleChoice)
|
BertForTokenClassification, BertForMultipleChoice)
|
||||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("use_cuda")
|
@require_torch
|
||||||
class BertModelTest(CommonTestCases.CommonModelTester):
|
class BertModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
|
all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
|
||||||
@@ -67,7 +65,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
num_labels=3,
|
num_labels=3,
|
||||||
num_choices=4,
|
num_choices=4,
|
||||||
scope=None,
|
scope=None,
|
||||||
device='cpu',
|
|
||||||
):
|
):
|
||||||
self.parent = parent
|
self.parent = parent
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
@@ -91,26 +88,25 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
self.num_labels = num_labels
|
self.num_labels = num_labels
|
||||||
self.num_choices = num_choices
|
self.num_choices = num_choices
|
||||||
self.scope = scope
|
self.scope = scope
|
||||||
self.device = device
|
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(self.device)
|
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
input_mask = None
|
input_mask = None
|
||||||
if self.use_input_mask:
|
if self.use_input_mask:
|
||||||
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(self.device)
|
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
||||||
|
|
||||||
token_type_ids = None
|
token_type_ids = None
|
||||||
if self.use_token_type_ids:
|
if self.use_token_type_ids:
|
||||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size).to(self.device)
|
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
sequence_labels = None
|
sequence_labels = None
|
||||||
token_labels = None
|
token_labels = None
|
||||||
choice_labels = None
|
choice_labels = None
|
||||||
if self.use_labels:
|
if self.use_labels:
|
||||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(self.device)
|
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(self.device)
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices).to(self.device)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = BertConfig(
|
config = BertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size_or_config_json_file=self.vocab_size,
|
||||||
@@ -144,7 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertModel(config=config)
|
model = BertModel(config=config)
|
||||||
model.to(input_ids.device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||||
@@ -161,6 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
|
def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
|
||||||
model = BertModel(config)
|
model = BertModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
|
||||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
|
||||||
@@ -177,6 +174,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForMaskedLM(config=config)
|
model = BertForMaskedLM(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -190,6 +188,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
|
def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
|
||||||
model = BertForMaskedLM(config=config)
|
model = BertForMaskedLM(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
|
||||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
|
||||||
@@ -204,6 +203,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForNextSentencePrediction(config=config)
|
model = BertForNextSentencePrediction(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
|
loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -217,6 +217,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForPreTraining(config=config)
|
model = BertForPreTraining(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
|
masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
|
||||||
@@ -235,6 +236,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForQuestionAnswering(config=config)
|
model = BertForQuestionAnswering(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
start_positions=sequence_labels, end_positions=sequence_labels)
|
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
@@ -254,6 +256,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = BertForSequenceClassification(config)
|
model = BertForSequenceClassification(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -268,6 +271,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = BertForTokenClassification(config=config)
|
model = BertForTokenClassification(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -282,6 +286,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_choices = self.num_choices
|
config.num_choices = self.num_choices
|
||||||
model = BertForMultipleChoice(config=config)
|
model = BertForMultipleChoice(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
@@ -313,10 +318,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def test_config(self):
|
def test_config(self):
|
||||||
self.config_tester.run_common_tests()
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
def test_bert_model(self, use_cuda=False):
|
def test_bert_model(self):
|
||||||
# ^^ This could be a real fixture
|
|
||||||
if use_cuda:
|
|
||||||
self.model_tester.device = "cuda"
|
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_bert_model(*config_and_inputs)
|
self.model_tester.create_and_check_bert_model(*config_and_inputs)
|
||||||
|
|
||||||
@@ -356,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
|
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -27,10 +27,11 @@ import uuid
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import logging
|
import logging
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -38,8 +39,6 @@ if is_torch_available():
|
|||||||
from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
|
from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
|
||||||
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
if sys.version_info[0] == 2:
|
if sys.version_info[0] == 2:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
@@ -65,6 +64,7 @@ def _config_zero_init(config):
|
|||||||
|
|
||||||
class CommonTestCases:
|
class CommonTestCases:
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class CommonModelTester(unittest.TestCase):
|
class CommonModelTester(unittest.TestCase):
|
||||||
|
|
||||||
model_tester = None
|
model_tester = None
|
||||||
@@ -79,6 +79,7 @@ class CommonTestCases:
|
|||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
@@ -86,12 +87,13 @@ class CommonTestCases:
|
|||||||
with TemporaryDirectory() as tmpdirname:
|
with TemporaryDirectory() as tmpdirname:
|
||||||
model.save_pretrained(tmpdirname)
|
model.save_pretrained(tmpdirname)
|
||||||
model = model_class.from_pretrained(tmpdirname)
|
model = model_class.from_pretrained(tmpdirname)
|
||||||
|
model.to(torch_device)
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
after_outputs = model(**inputs_dict)
|
after_outputs = model(**inputs_dict)
|
||||||
|
|
||||||
# Make sure we don't have nans
|
# Make sure we don't have nans
|
||||||
out_1 = after_outputs[0].numpy()
|
out_1 = after_outputs[0].cpu().numpy()
|
||||||
out_2 = outputs[0].numpy()
|
out_2 = outputs[0].cpu().numpy()
|
||||||
out_1 = out_1[~np.isnan(out_1)]
|
out_1 = out_1[~np.isnan(out_1)]
|
||||||
out_2 = out_2[~np.isnan(out_2)]
|
out_2 = out_2[~np.isnan(out_2)]
|
||||||
max_diff = np.amax(np.abs(out_1 - out_2))
|
max_diff = np.amax(np.abs(out_1 - out_2))
|
||||||
@@ -113,6 +115,7 @@ class CommonTestCases:
|
|||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
|
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
|
||||||
self.assertEqual(first.ne(second).sum().item(), 0)
|
self.assertEqual(first.ne(second).sum().item(), 0)
|
||||||
@@ -125,6 +128,7 @@ class CommonTestCases:
|
|||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = False
|
config.output_hidden_states = False
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
@@ -142,6 +146,7 @@ class CommonTestCases:
|
|||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = True
|
config.output_hidden_states = True
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
self.assertEqual(out_len+1, len(outputs))
|
self.assertEqual(out_len+1, len(outputs))
|
||||||
@@ -181,6 +186,7 @@ class CommonTestCases:
|
|||||||
configs_no_init.torchscript = True
|
configs_no_init.torchscript = True
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config=configs_no_init)
|
model = model_class(config=configs_no_init)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
inputs = inputs_dict['input_ids'] # Let's keep only input_ids
|
inputs = inputs_dict['input_ids'] # Let's keep only input_ids
|
||||||
|
|
||||||
@@ -201,7 +207,10 @@ class CommonTestCases:
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
self.fail("Couldn't load module.")
|
self.fail("Couldn't load module.")
|
||||||
|
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
|
loaded_model.to(torch_device)
|
||||||
loaded_model.eval()
|
loaded_model.eval()
|
||||||
|
|
||||||
model_params = model.parameters()
|
model_params = model.parameters()
|
||||||
@@ -228,11 +237,12 @@ class CommonTestCases:
|
|||||||
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
|
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config=configs_no_init)
|
model = model_class(config=configs_no_init)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
# Prepare head_mask
|
# Prepare head_mask
|
||||||
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
||||||
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
|
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
|
||||||
head_mask[0, 0] = 0
|
head_mask[0, 0] = 0
|
||||||
head_mask[-1, :-1] = 0
|
head_mask[-1, :-1] = 0
|
||||||
head_mask.requires_grad_(requires_grad=True)
|
head_mask.requires_grad_(requires_grad=True)
|
||||||
@@ -282,6 +292,7 @@ class CommonTestCases:
|
|||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = False
|
config.output_hidden_states = False
|
||||||
model = model_class(config=config)
|
model = model_class(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||||
-1: [0]}
|
-1: [0]}
|
||||||
@@ -310,6 +321,7 @@ class CommonTestCases:
|
|||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = False
|
config.output_hidden_states = False
|
||||||
model = model_class(config=config)
|
model = model_class(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||||
-1: [0]}
|
-1: [0]}
|
||||||
@@ -319,6 +331,7 @@ class CommonTestCases:
|
|||||||
os.makedirs(directory)
|
os.makedirs(directory)
|
||||||
model.save_pretrained(directory)
|
model.save_pretrained(directory)
|
||||||
model = model_class.from_pretrained(directory)
|
model = model_class.from_pretrained(directory)
|
||||||
|
model.to(torch_device)
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
@@ -346,6 +359,7 @@ class CommonTestCases:
|
|||||||
config.pruned_heads = heads_to_prune
|
config.pruned_heads = heads_to_prune
|
||||||
|
|
||||||
model = model_class(config=config)
|
model = model_class(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
@@ -372,6 +386,7 @@ class CommonTestCases:
|
|||||||
config.pruned_heads = heads_to_prune
|
config.pruned_heads = heads_to_prune
|
||||||
|
|
||||||
model = model_class(config=config)
|
model = model_class(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
@@ -388,6 +403,7 @@ class CommonTestCases:
|
|||||||
os.makedirs(directory)
|
os.makedirs(directory)
|
||||||
model.save_pretrained(directory)
|
model.save_pretrained(directory)
|
||||||
model = model_class.from_pretrained(directory)
|
model = model_class.from_pretrained(directory)
|
||||||
|
model.to(torch_device)
|
||||||
shutil.rmtree(directory)
|
shutil.rmtree(directory)
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
@@ -419,6 +435,7 @@ class CommonTestCases:
|
|||||||
config.output_hidden_states = True
|
config.output_hidden_states = True
|
||||||
config.output_attentions = False
|
config.output_attentions = False
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
hidden_states = outputs[-1]
|
hidden_states = outputs[-1]
|
||||||
@@ -538,6 +555,7 @@ class CommonTestCases:
|
|||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
wte = model.get_input_embeddings()
|
wte = model.get_input_embeddings()
|
||||||
@@ -628,6 +646,7 @@ class CommonTestCases:
|
|||||||
def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
|
def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
|
||||||
mc_labels, lm_labels, mc_token_ids):
|
mc_labels, lm_labels, mc_token_ids):
|
||||||
model = self.base_model_class(config)
|
model = self.base_model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
outputs = model(input_ids, position_ids, token_type_ids)
|
outputs = model(input_ids, position_ids, token_type_ids)
|
||||||
@@ -643,6 +662,7 @@ class CommonTestCases:
|
|||||||
def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
|
def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
|
||||||
mc_labels, lm_labels, mc_token_ids):
|
mc_labels, lm_labels, mc_token_ids):
|
||||||
model = self.lm_head_model_class(config)
|
model = self.lm_head_model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
||||||
loss, lm_logits = outputs[:2]
|
loss, lm_logits = outputs[:2]
|
||||||
@@ -659,6 +679,7 @@ class CommonTestCases:
|
|||||||
mc_labels, lm_labels, mc_token_ids):
|
mc_labels, lm_labels, mc_token_ids):
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids)
|
||||||
presents = outputs[-1]
|
presents = outputs[-1]
|
||||||
@@ -671,6 +692,7 @@ class CommonTestCases:
|
|||||||
def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
|
def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
|
||||||
mc_labels, lm_labels, mc_token_ids):
|
mc_labels, lm_labels, mc_token_ids):
|
||||||
model = self.double_head_model_class(config)
|
model = self.double_head_model_class(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
|
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
|
||||||
token_type_ids=token_type_ids, position_ids=position_ids)
|
token_type_ids=token_type_ids, position_ids=position_ids)
|
||||||
@@ -716,7 +738,7 @@ class CommonTestCases:
|
|||||||
config_and_inputs = self.prepare_config_and_inputs()
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
self.create_and_check_presents(*config_and_inputs)
|
self.create_and_check_presents(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def run_slow_tests(self):
|
def run_slow_tests(self):
|
||||||
self.create_and_check_model_from_pretrained()
|
self.create_and_check_model_from_pretrained()
|
||||||
|
|
||||||
@@ -770,7 +792,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
|
|||||||
for _ in range(total_dims):
|
for _ in range(total_dims):
|
||||||
values.append(rng.randint(0, vocab_size - 1))
|
values.append(rng.randint(0, vocab_size - 1))
|
||||||
|
|
||||||
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
|
return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
|
||||||
|
|
||||||
|
|
||||||
def floats_tensor(shape, scale=1.0, rng=None, name=None):
|
def floats_tensor(shape, scale=1.0, rng=None, name=None):
|
||||||
@@ -786,11 +808,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
|
|||||||
for _ in range(total_dims):
|
for _ in range(total_dims):
|
||||||
values.append(rng.random() * scale)
|
values.append(rng.random() * scale)
|
||||||
|
|
||||||
return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
|
return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class ModelUtilsTest(unittest.TestCase):
|
class ModelUtilsTest(unittest.TestCase):
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ from __future__ import division
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import pytest
|
|
||||||
import shutil
|
import shutil
|
||||||
import pdb
|
import pdb
|
||||||
|
|
||||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
|||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
CTRLLMHeadModel)
|
CTRLLMHeadModel)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class CTRLModelTest(CommonTestCases.CommonModelTester):
|
class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
|
all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
|
||||||
@@ -140,6 +139,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||||
model = CTRLModel(config=config)
|
model = CTRLModel(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||||
@@ -157,6 +157,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||||
model = CTRLLMHeadModel(config)
|
model = CTRLLMHeadModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||||
@@ -202,7 +203,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
|
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ from __future__ import division
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
@@ -25,13 +24,13 @@ if is_torch_available():
|
|||||||
from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
|
from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
|
||||||
DistilBertForTokenClassification,
|
DistilBertForTokenClassification,
|
||||||
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
|
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
|
all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
|
||||||
@@ -126,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = DistilBertModel(config=config)
|
model = DistilBertModel(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
(sequence_output,) = model(input_ids, input_mask)
|
(sequence_output,) = model(input_ids, input_mask)
|
||||||
(sequence_output,) = model(input_ids)
|
(sequence_output,) = model(input_ids)
|
||||||
@@ -139,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = DistilBertForMaskedLM(config=config)
|
model = DistilBertForMaskedLM(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -152,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = DistilBertForQuestionAnswering(config=config)
|
model = DistilBertForQuestionAnswering(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
|
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -170,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = DistilBertForSequenceClassification(config)
|
model = DistilBertForSequenceClassification(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -184,6 +187,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = DistilBertForTokenClassification(config=config)
|
model = DistilBertForTokenClassification(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
|
||||||
@@ -229,7 +233,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
|
self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
|
||||||
|
|
||||||
# @pytest.mark.slow
|
# @slow
|
||||||
# def test_model_from_pretrained(self):
|
# def test_model_from_pretrained(self):
|
||||||
# cache_dir = "/tmp/transformers_test/"
|
# cache_dir = "/tmp/transformers_test/"
|
||||||
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -15,19 +15,18 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import unittest
|
import unittest
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
from .utils import require_torch, slow
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import BertModel, BertForMaskedLM, Model2Model
|
from transformers import BertModel, BertForMaskedLM, Model2Model
|
||||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class EncoderDecoderModelTest(unittest.TestCase):
|
class EncoderDecoderModelTest(unittest.TestCase):
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model2model_from_pretrained(self):
|
def test_model2model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ from __future__ import division
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import pytest
|
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
|||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
|
all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
|
||||||
@@ -136,6 +135,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||||
model = GPT2Model(config=config)
|
model = GPT2Model(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||||
@@ -153,6 +153,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||||
model = GPT2LMHeadModel(config)
|
model = GPT2LMHeadModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||||
@@ -171,6 +172,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
|
def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
|
||||||
model = GPT2DoubleHeadsModel(config)
|
model = GPT2DoubleHeadsModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
|
|
||||||
@@ -235,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -17,7 +17,6 @@ from __future__ import division
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import pytest
|
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
|||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
|
all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
|
||||||
@@ -124,6 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
model = OpenAIGPTModel(config=config)
|
model = OpenAIGPTModel(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||||
@@ -139,6 +139,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
model = OpenAIGPTLMHeadModel(config)
|
model = OpenAIGPTLMHeadModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||||
@@ -157,6 +158,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
model = OpenAIGPTDoubleHeadsModel(config)
|
model = OpenAIGPTDoubleHeadsModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
|
loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
|
||||||
@@ -203,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
@@ -27,13 +26,13 @@ if is_torch_available():
|
|||||||
from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
|
from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
|
||||||
RobertaForSequenceClassification, RobertaForTokenClassification)
|
RobertaForSequenceClassification, RobertaForTokenClassification)
|
||||||
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class RobertaModelTest(CommonTestCases.CommonModelTester):
|
class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
|
all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
|
||||||
@@ -129,6 +128,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
|
def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
|
||||||
token_labels, choice_labels):
|
token_labels, choice_labels):
|
||||||
model = RobertaModel(config=config)
|
model = RobertaModel(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||||
@@ -146,6 +146,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
|
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
|
||||||
token_labels, choice_labels):
|
token_labels, choice_labels):
|
||||||
model = RobertaForMaskedLM(config=config)
|
model = RobertaForMaskedLM(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -161,6 +162,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
sequence_labels, token_labels, choice_labels):
|
sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = RobertaForTokenClassification(config=config)
|
model = RobertaForTokenClassification(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
labels=token_labels)
|
labels=token_labels)
|
||||||
@@ -195,7 +197,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
|
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
@@ -207,10 +209,10 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
class RobertaModelIntegrationTest(unittest.TestCase):
|
class RobertaModelIntegrationTest(unittest.TestCase):
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_inference_masked_lm(self):
|
def test_inference_masked_lm(self):
|
||||||
model = RobertaForMaskedLM.from_pretrained('roberta-base')
|
model = RobertaForMaskedLM.from_pretrained('roberta-base')
|
||||||
|
|
||||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||||
output = model(input_ids)[0]
|
output = model(input_ids)[0]
|
||||||
expected_shape = torch.Size((1, 11, 50265))
|
expected_shape = torch.Size((1, 11, 50265))
|
||||||
@@ -228,10 +230,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
|
|||||||
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_inference_no_head(self):
|
def test_inference_no_head(self):
|
||||||
model = RobertaModel.from_pretrained('roberta-base')
|
model = RobertaModel.from_pretrained('roberta-base')
|
||||||
|
|
||||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||||
output = model(input_ids)[0]
|
output = model(input_ids)[0]
|
||||||
# compare the actual values for a slice.
|
# compare the actual values for a slice.
|
||||||
@@ -244,10 +246,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
|
|||||||
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_inference_classification_head(self):
|
def test_inference_classification_head(self):
|
||||||
model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
|
model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
|
||||||
|
|
||||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||||
output = model(input_ids)[0]
|
output = model(input_ids)[0]
|
||||||
expected_shape = torch.Size((1, 3))
|
expected_shape = torch.Size((1, 3))
|
||||||
|
|||||||
230
transformers/tests/modeling_tf_albert_test.py
Normal file
230
transformers/tests/modeling_tf_albert_test.py
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
|
from transformers import AlbertConfig, is_tf_available
|
||||||
|
|
||||||
|
if is_tf_available():
|
||||||
|
import tensorflow as tf
|
||||||
|
from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
|
||||||
|
TFAlbertForSequenceClassification,
|
||||||
|
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
|
class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
|
all_model_classes = (
|
||||||
|
TFAlbertModel,
|
||||||
|
TFAlbertForMaskedLM,
|
||||||
|
TFAlbertForSequenceClassification
|
||||||
|
) if is_tf_available() else ()
|
||||||
|
|
||||||
|
class TFAlbertModelTester(object):
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
parent,
|
||||||
|
batch_size=13,
|
||||||
|
seq_length=7,
|
||||||
|
is_training=True,
|
||||||
|
use_input_mask=True,
|
||||||
|
use_token_type_ids=True,
|
||||||
|
use_labels=True,
|
||||||
|
vocab_size=99,
|
||||||
|
embedding_size=16,
|
||||||
|
hidden_size=32,
|
||||||
|
num_hidden_layers=5,
|
||||||
|
num_attention_heads=4,
|
||||||
|
intermediate_size=37,
|
||||||
|
hidden_act="gelu",
|
||||||
|
hidden_dropout_prob=0.1,
|
||||||
|
attention_probs_dropout_prob=0.1,
|
||||||
|
max_position_embeddings=512,
|
||||||
|
type_vocab_size=16,
|
||||||
|
type_sequence_label_size=2,
|
||||||
|
initializer_range=0.02,
|
||||||
|
num_labels=3,
|
||||||
|
num_choices=4,
|
||||||
|
scope=None,
|
||||||
|
):
|
||||||
|
self.parent = parent
|
||||||
|
self.batch_size = batch_size
|
||||||
|
self.seq_length = seq_length
|
||||||
|
self.is_training = is_training
|
||||||
|
self.use_input_mask = use_input_mask
|
||||||
|
self.use_token_type_ids = use_token_type_ids
|
||||||
|
self.use_labels = use_labels
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.embedding_size = embedding_size
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.hidden_act = hidden_act
|
||||||
|
self.hidden_dropout_prob = hidden_dropout_prob
|
||||||
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
self.type_vocab_size = type_vocab_size
|
||||||
|
self.type_sequence_label_size = type_sequence_label_size
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.num_labels = num_labels
|
||||||
|
self.num_choices = num_choices
|
||||||
|
self.scope = scope
|
||||||
|
|
||||||
|
def prepare_config_and_inputs(self):
|
||||||
|
input_ids = ids_tensor(
|
||||||
|
[self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
|
input_mask = None
|
||||||
|
if self.use_input_mask:
|
||||||
|
input_mask = ids_tensor(
|
||||||
|
[self.batch_size, self.seq_length], vocab_size=2)
|
||||||
|
|
||||||
|
token_type_ids = None
|
||||||
|
if self.use_token_type_ids:
|
||||||
|
token_type_ids = ids_tensor(
|
||||||
|
[self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
|
sequence_labels = None
|
||||||
|
token_labels = None
|
||||||
|
choice_labels = None
|
||||||
|
if self.use_labels:
|
||||||
|
sequence_labels = ids_tensor(
|
||||||
|
[self.batch_size], self.type_sequence_label_size)
|
||||||
|
token_labels = ids_tensor(
|
||||||
|
[self.batch_size, self.seq_length], self.num_labels)
|
||||||
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
|
config = AlbertConfig(
|
||||||
|
vocab_size_or_config_json_file=self.vocab_size,
|
||||||
|
hidden_size=self.hidden_size,
|
||||||
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
intermediate_size=self.intermediate_size,
|
||||||
|
hidden_act=self.hidden_act,
|
||||||
|
hidden_dropout_prob=self.hidden_dropout_prob,
|
||||||
|
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||||
|
max_position_embeddings=self.max_position_embeddings,
|
||||||
|
type_vocab_size=self.type_vocab_size,
|
||||||
|
initializer_range=self.initializer_range)
|
||||||
|
|
||||||
|
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||||
|
|
||||||
|
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
|
model = TFAlbertModel(config=config)
|
||||||
|
# inputs = {'input_ids': input_ids,
|
||||||
|
# 'attention_mask': input_mask,
|
||||||
|
# 'token_type_ids': token_type_ids}
|
||||||
|
# sequence_output, pooled_output = model(**inputs)
|
||||||
|
inputs = {'input_ids': input_ids,
|
||||||
|
'attention_mask': input_mask,
|
||||||
|
'token_type_ids': token_type_ids}
|
||||||
|
sequence_output, pooled_output = model(inputs)
|
||||||
|
|
||||||
|
inputs = [input_ids, input_mask]
|
||||||
|
sequence_output, pooled_output = model(inputs)
|
||||||
|
|
||||||
|
sequence_output, pooled_output = model(input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"sequence_output": sequence_output.numpy(),
|
||||||
|
"pooled_output": pooled_output.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["sequence_output"].shape),
|
||||||
|
[self.batch_size, self.seq_length, self.hidden_size])
|
||||||
|
self.parent.assertListEqual(list(result["pooled_output"].shape), [
|
||||||
|
self.batch_size, self.hidden_size])
|
||||||
|
|
||||||
|
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
|
model = TFAlbertForMaskedLM(config=config)
|
||||||
|
inputs = {'input_ids': input_ids,
|
||||||
|
'attention_mask': input_mask,
|
||||||
|
'token_type_ids': token_type_ids}
|
||||||
|
prediction_scores, = model(inputs)
|
||||||
|
result = {
|
||||||
|
"prediction_scores": prediction_scores.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["prediction_scores"].shape),
|
||||||
|
[self.batch_size, self.seq_length, self.vocab_size])
|
||||||
|
|
||||||
|
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
|
config.num_labels = self.num_labels
|
||||||
|
model = TFAlbertForSequenceClassification(config=config)
|
||||||
|
inputs = {'input_ids': input_ids,
|
||||||
|
'attention_mask': input_mask,
|
||||||
|
'token_type_ids': token_type_ids}
|
||||||
|
logits, = model(inputs)
|
||||||
|
result = {
|
||||||
|
"logits": logits.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["logits"].shape),
|
||||||
|
[self.batch_size, self.num_labels])
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
(config, input_ids, token_type_ids, input_mask,
|
||||||
|
sequence_labels, token_labels, choice_labels) = config_and_inputs
|
||||||
|
inputs_dict = {'input_ids': input_ids,
|
||||||
|
'token_type_ids': token_type_ids, 'attention_mask': input_mask}
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
|
||||||
|
self.config_tester = ConfigTester(
|
||||||
|
self, config_class=AlbertConfig, hidden_size=37)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
|
def test_albert_model(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_albert_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_for_masked_lm(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_albert_for_masked_lm(
|
||||||
|
*config_and_inputs)
|
||||||
|
|
||||||
|
def test_for_sequence_classification(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_albert_for_sequence_classification(
|
||||||
|
*config_and_inputs)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_model_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/transformers_test/"
|
||||||
|
# for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
for model_name in ['albert-base-uncased']:
|
||||||
|
model = TFAlbertModel.from_pretrained(
|
||||||
|
model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -18,11 +18,12 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from transformers import is_tf_available
|
from transformers import is_tf_available
|
||||||
|
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
from transformers import (AutoConfig, BertConfig,
|
from transformers import (AutoConfig, BertConfig,
|
||||||
TFAutoModel, TFBertModel,
|
TFAutoModel, TFBertModel,
|
||||||
@@ -33,11 +34,11 @@ if is_tf_available():
|
|||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFAutoModelTest(unittest.TestCase):
|
class TFAutoModelTest(unittest.TestCase):
|
||||||
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
import h5py
|
import h5py
|
||||||
self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
|
self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
|
||||||
@@ -53,6 +54,7 @@ class TFAutoModelTest(unittest.TestCase):
|
|||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
self.assertIsInstance(model, TFBertModel)
|
self.assertIsInstance(model, TFBertModel)
|
||||||
|
|
||||||
|
@slow
|
||||||
def test_lmhead_model_from_pretrained(self):
|
def test_lmhead_model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
@@ -65,6 +67,7 @@ class TFAutoModelTest(unittest.TestCase):
|
|||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
self.assertIsInstance(model, TFBertForMaskedLM)
|
self.assertIsInstance(model, TFBertForMaskedLM)
|
||||||
|
|
||||||
|
@slow
|
||||||
def test_sequence_classification_model_from_pretrained(self):
|
def test_sequence_classification_model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
@@ -77,6 +80,7 @@ class TFAutoModelTest(unittest.TestCase):
|
|||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
self.assertIsInstance(model, TFBertForSequenceClassification)
|
self.assertIsInstance(model, TFBertForSequenceClassification)
|
||||||
|
|
||||||
|
@slow
|
||||||
def test_question_answering_model_from_pretrained(self):
|
def test_question_answering_model_from_pretrained(self):
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import BertConfig, is_tf_available
|
from transformers import BertConfig, is_tf_available
|
||||||
|
|
||||||
@@ -36,10 +36,9 @@ if is_tf_available():
|
|||||||
TFBertForTokenClassification,
|
TFBertForTokenClassification,
|
||||||
TFBertForQuestionAnswering,
|
TFBertForQuestionAnswering,
|
||||||
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,
|
all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,
|
||||||
@@ -309,7 +308,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
|
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -25,18 +25,17 @@ import unittest
|
|||||||
import uuid
|
import uuid
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
import pytest
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from transformers import is_tf_available, is_torch_available
|
from transformers import is_tf_available, is_torch_available
|
||||||
|
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from transformers import TFPreTrainedModel
|
from transformers import TFPreTrainedModel
|
||||||
# from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
# from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
if sys.version_info[0] == 2:
|
if sys.version_info[0] == 2:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
@@ -62,6 +61,7 @@ def _config_zero_init(config):
|
|||||||
|
|
||||||
class TFCommonTestCases:
|
class TFCommonTestCases:
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFCommonModelTester(unittest.TestCase):
|
class TFCommonModelTester(unittest.TestCase):
|
||||||
|
|
||||||
model_tester = None
|
model_tester = None
|
||||||
@@ -164,7 +164,7 @@ class TFCommonTestCases:
|
|||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
# Prepare our model
|
# Prepare our model
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
|
||||||
# Let's load it from the disk to be sure we can use pretrained weights
|
# Let's load it from the disk to be sure we can use pretrained weights
|
||||||
with TemporaryDirectory() as tmpdirname:
|
with TemporaryDirectory() as tmpdirname:
|
||||||
outputs = model(inputs_dict) # build the model
|
outputs = model(inputs_dict) # build the model
|
||||||
@@ -233,80 +233,6 @@ class TFCommonTestCases:
|
|||||||
self.model_tester.seq_length,
|
self.model_tester.seq_length,
|
||||||
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
|
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
|
||||||
|
|
||||||
def test_headmasking(self):
|
|
||||||
pass
|
|
||||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
|
||||||
|
|
||||||
# config.output_attentions = True
|
|
||||||
# config.output_hidden_states = True
|
|
||||||
# configs_no_init = _config_zero_init(config) # To be sure we have no Nan
|
|
||||||
# for model_class in self.all_model_classes:
|
|
||||||
# model = model_class(config=configs_no_init)
|
|
||||||
# model.eval()
|
|
||||||
|
|
||||||
# # Prepare head_mask
|
|
||||||
# # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
|
||||||
# head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
|
|
||||||
# head_mask[0, 0] = 0
|
|
||||||
# head_mask[-1, :-1] = 0
|
|
||||||
# head_mask.requires_grad_(requires_grad=True)
|
|
||||||
# inputs = inputs_dict.copy()
|
|
||||||
# inputs['head_mask'] = head_mask
|
|
||||||
|
|
||||||
# outputs = model(**inputs)
|
|
||||||
|
|
||||||
# # Test that we can get a gradient back for importance score computation
|
|
||||||
# output = sum(t.sum() for t in outputs[0])
|
|
||||||
# output = output.sum()
|
|
||||||
# output.backward()
|
|
||||||
# multihead_outputs = head_mask.grad
|
|
||||||
|
|
||||||
# attentions = outputs[-1]
|
|
||||||
# hidden_states = outputs[-2]
|
|
||||||
|
|
||||||
# # Remove Nan
|
|
||||||
|
|
||||||
# self.assertIsNotNone(multihead_outputs)
|
|
||||||
# self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
|
|
||||||
# self.assertAlmostEqual(
|
|
||||||
# attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
|
|
||||||
# self.assertNotEqual(
|
|
||||||
# attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
|
|
||||||
# self.assertNotEqual(
|
|
||||||
# attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
|
|
||||||
# self.assertAlmostEqual(
|
|
||||||
# attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
|
|
||||||
# self.assertNotEqual(
|
|
||||||
# attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
|
|
||||||
|
|
||||||
|
|
||||||
def test_head_pruning(self):
|
|
||||||
pass
|
|
||||||
# if not self.test_pruning:
|
|
||||||
# return
|
|
||||||
|
|
||||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
|
||||||
|
|
||||||
# for model_class in self.all_model_classes:
|
|
||||||
# config.output_attentions = True
|
|
||||||
# config.output_hidden_states = False
|
|
||||||
# model = model_class(config=config)
|
|
||||||
# model.eval()
|
|
||||||
# heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
|
||||||
# -1: [0]}
|
|
||||||
# model.prune_heads(heads_to_prune)
|
|
||||||
# outputs = model(**inputs_dict)
|
|
||||||
|
|
||||||
# attentions = outputs[-1]
|
|
||||||
|
|
||||||
# self.assertEqual(
|
|
||||||
# attentions[0].shape[-3], 1)
|
|
||||||
# self.assertEqual(
|
|
||||||
# attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
|
||||||
# self.assertEqual(
|
|
||||||
# attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
|
||||||
|
|
||||||
|
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
@@ -323,43 +249,6 @@ class TFCommonTestCases:
|
|||||||
list(hidden_states[0].shape[-2:]),
|
list(hidden_states[0].shape[-2:]),
|
||||||
[self.model_tester.seq_length, self.model_tester.hidden_size])
|
[self.model_tester.seq_length, self.model_tester.hidden_size])
|
||||||
|
|
||||||
|
|
||||||
def test_resize_tokens_embeddings(self):
|
|
||||||
pass
|
|
||||||
# original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
|
||||||
# if not self.test_resize_embeddings:
|
|
||||||
# return
|
|
||||||
|
|
||||||
# for model_class in self.all_model_classes:
|
|
||||||
# config = copy.deepcopy(original_config)
|
|
||||||
# model = model_class(config)
|
|
||||||
|
|
||||||
# model_vocab_size = config.vocab_size
|
|
||||||
# # Retrieve the embeddings and clone theme
|
|
||||||
# model_embed = model.resize_token_embeddings(model_vocab_size)
|
|
||||||
# cloned_embeddings = model_embed.weight.clone()
|
|
||||||
|
|
||||||
# # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
|
|
||||||
# model_embed = model.resize_token_embeddings(model_vocab_size + 10)
|
|
||||||
# self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
|
|
||||||
# # Check that it actually resizes the embeddings matrix
|
|
||||||
# self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
|
|
||||||
|
|
||||||
# # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
|
|
||||||
# model_embed = model.resize_token_embeddings(model_vocab_size - 15)
|
|
||||||
# self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
|
|
||||||
# # Check that it actually resizes the embeddings matrix
|
|
||||||
# self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
|
|
||||||
|
|
||||||
# # Check that adding and removing tokens has not modified the first part of the embedding matrix.
|
|
||||||
# models_equal = True
|
|
||||||
# for p1, p2 in zip(cloned_embeddings, model_embed.weight):
|
|
||||||
# if p1.data.ne(p2.data).sum() > 0:
|
|
||||||
# models_equal = False
|
|
||||||
|
|
||||||
# self.assertTrue(models_equal)
|
|
||||||
|
|
||||||
|
|
||||||
def test_model_common_attributes(self):
|
def test_model_common_attributes(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
@@ -369,40 +258,6 @@ class TFCommonTestCases:
|
|||||||
x = model.get_output_embeddings()
|
x = model.get_output_embeddings()
|
||||||
assert x is None or isinstance(x, tf.keras.layers.Layer)
|
assert x is None or isinstance(x, tf.keras.layers.Layer)
|
||||||
|
|
||||||
|
|
||||||
def test_tie_model_weights(self):
|
|
||||||
pass
|
|
||||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
|
||||||
|
|
||||||
# def check_same_values(layer_1, layer_2):
|
|
||||||
# equal = True
|
|
||||||
# for p1, p2 in zip(layer_1.weight, layer_2.weight):
|
|
||||||
# if p1.data.ne(p2.data).sum() > 0:
|
|
||||||
# equal = False
|
|
||||||
# return equal
|
|
||||||
|
|
||||||
# for model_class in self.all_model_classes:
|
|
||||||
# if not hasattr(model_class, 'tie_weights'):
|
|
||||||
# continue
|
|
||||||
|
|
||||||
# config.torchscript = True
|
|
||||||
# model_not_tied = model_class(config)
|
|
||||||
# params_not_tied = list(model_not_tied.parameters())
|
|
||||||
|
|
||||||
# config_tied = copy.deepcopy(config)
|
|
||||||
# config_tied.torchscript = False
|
|
||||||
# model_tied = model_class(config_tied)
|
|
||||||
# params_tied = list(model_tied.parameters())
|
|
||||||
|
|
||||||
# # Check that the embedding layer and decoding layer are the same in size and in value
|
|
||||||
# self.assertGreater(len(params_not_tied), len(params_tied))
|
|
||||||
|
|
||||||
# # Check that after resize they remain tied.
|
|
||||||
# model_tied.resize_token_embeddings(config.vocab_size + 10)
|
|
||||||
# params_tied_2 = list(model_tied.parameters())
|
|
||||||
# self.assertGreater(len(params_not_tied), len(params_tied))
|
|
||||||
# self.assertEqual(len(params_tied_2), len(params_tied))
|
|
||||||
|
|
||||||
def test_determinism(self):
|
def test_determinism(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
@@ -426,9 +281,17 @@ class TFCommonTestCases:
|
|||||||
try:
|
try:
|
||||||
x = wte([input_ids], mode="embedding")
|
x = wte([input_ids], mode="embedding")
|
||||||
except:
|
except:
|
||||||
x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
|
try:
|
||||||
|
x = wte([input_ids, None, None, None], mode="embedding")
|
||||||
|
except:
|
||||||
|
if hasattr(self.model_tester, "embedding_size"):
|
||||||
|
x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
|
||||||
|
else:
|
||||||
|
x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
|
||||||
# ^^ In our TF models, the input_embeddings can take slightly different forms,
|
# ^^ In our TF models, the input_embeddings can take slightly different forms,
|
||||||
# so we try two of them and fall back to just synthetically creating a dummy tensor of ones.
|
# so we try a few of them.
|
||||||
|
# We used to fall back to just synthetically creating a dummy tensor of ones:
|
||||||
|
#
|
||||||
inputs_dict["inputs_embeds"] = x
|
inputs_dict["inputs_embeds"] = x
|
||||||
outputs = model(inputs_dict)
|
outputs = model(inputs_dict)
|
||||||
|
|
||||||
@@ -453,29 +316,5 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
class TFModelUtilsTest(unittest.TestCase):
|
|
||||||
@pytest.mark.skipif('tensorflow' not in sys.modules, reason="requires TensorFlow")
|
|
||||||
def test_model_from_pretrained(self):
|
|
||||||
pass
|
|
||||||
# logging.basicConfig(level=logging.INFO)
|
|
||||||
# for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
|
||||||
# config = BertConfig.from_pretrained(model_name)
|
|
||||||
# self.assertIsNotNone(config)
|
|
||||||
# self.assertIsInstance(config, PretrainedConfig)
|
|
||||||
|
|
||||||
# model = BertModel.from_pretrained(model_name)
|
|
||||||
# model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
|
|
||||||
# self.assertIsNotNone(model)
|
|
||||||
# self.assertIsInstance(model, PreTrainedModel)
|
|
||||||
# for value in loading_info.values():
|
|
||||||
# self.assertEqual(len(value), 0)
|
|
||||||
|
|
||||||
# config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
|
||||||
# model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
|
||||||
# self.assertEqual(model.config.output_attentions, True)
|
|
||||||
# self.assertEqual(model.config.output_hidden_states, True)
|
|
||||||
# self.assertEqual(model.config, config)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import CTRLConfig, is_tf_available
|
from transformers import CTRLConfig, is_tf_available
|
||||||
|
|
||||||
@@ -30,10 +30,9 @@ if is_tf_available():
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
|
from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
|
||||||
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
|
all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
|
||||||
@@ -188,7 +187,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
|
self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -17,10 +17,10 @@ from __future__ import division
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import pytest
|
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import DistilBertConfig, is_tf_available
|
from transformers import DistilBertConfig, is_tf_available
|
||||||
|
|
||||||
@@ -30,10 +30,9 @@ if is_tf_available():
|
|||||||
TFDistilBertForMaskedLM,
|
TFDistilBertForMaskedLM,
|
||||||
TFDistilBertForQuestionAnswering,
|
TFDistilBertForQuestionAnswering,
|
||||||
TFDistilBertForSequenceClassification)
|
TFDistilBertForSequenceClassification)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
|
all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
|
||||||
@@ -210,7 +209,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
|
self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
|
||||||
|
|
||||||
# @pytest.mark.slow
|
# @slow
|
||||||
# def test_model_from_pretrained(self):
|
# def test_model_from_pretrained(self):
|
||||||
# cache_dir = "/tmp/transformers_test/"
|
# cache_dir = "/tmp/transformers_test/"
|
||||||
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import GPT2Config, is_tf_available
|
from transformers import GPT2Config, is_tf_available
|
||||||
|
|
||||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
|||||||
from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
|
from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
|
||||||
TFGPT2DoubleHeadsModel,
|
TFGPT2DoubleHeadsModel,
|
||||||
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
|
all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
|
||||||
@@ -219,7 +218,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
|
self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import OpenAIGPTConfig, is_tf_available
|
from transformers import OpenAIGPTConfig, is_tf_available
|
||||||
|
|
||||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
|||||||
from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
|
from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
|
||||||
TFOpenAIGPTDoubleHeadsModel,
|
TFOpenAIGPTDoubleHeadsModel,
|
||||||
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
|
all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
|
||||||
@@ -218,7 +217,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
|
self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -18,10 +18,10 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import RobertaConfig, is_tf_available
|
from transformers import RobertaConfig, is_tf_available
|
||||||
|
|
||||||
@@ -32,10 +32,9 @@ if is_tf_available():
|
|||||||
TFRobertaForSequenceClassification,
|
TFRobertaForSequenceClassification,
|
||||||
TFRobertaForTokenClassification,
|
TFRobertaForTokenClassification,
|
||||||
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM,
|
all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM,
|
||||||
@@ -191,7 +190,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
|
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
@@ -203,10 +202,10 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
|
|
||||||
class TFRobertaModelIntegrationTest(unittest.TestCase):
|
class TFRobertaModelIntegrationTest(unittest.TestCase):
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_inference_masked_lm(self):
|
def test_inference_masked_lm(self):
|
||||||
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
|
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
|
||||||
|
|
||||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||||
output = model(input_ids)[0]
|
output = model(input_ids)[0]
|
||||||
expected_shape = [1, 11, 50265]
|
expected_shape = [1, 11, 50265]
|
||||||
@@ -224,10 +223,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
|
|||||||
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
|
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_inference_no_head(self):
|
def test_inference_no_head(self):
|
||||||
model = TFRobertaModel.from_pretrained('roberta-base')
|
model = TFRobertaModel.from_pretrained('roberta-base')
|
||||||
|
|
||||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||||
output = model(input_ids)[0]
|
output = model(input_ids)[0]
|
||||||
# compare the actual values for a slice.
|
# compare the actual values for a slice.
|
||||||
@@ -240,10 +239,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
|
|||||||
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
|
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_inference_classification_head(self):
|
def test_inference_classification_head(self):
|
||||||
model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
|
model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
|
||||||
|
|
||||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||||
output = model(input_ids)[0]
|
output = model(input_ids)[0]
|
||||||
expected_shape = [1, 3]
|
expected_shape = [1, 3]
|
||||||
|
|||||||
@@ -19,10 +19,10 @@ from __future__ import print_function
|
|||||||
import unittest
|
import unittest
|
||||||
import random
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import TransfoXLConfig, is_tf_available
|
from transformers import TransfoXLConfig, is_tf_available
|
||||||
|
|
||||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
|||||||
from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
|
from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
|
||||||
TFTransfoXLLMHeadModel,
|
TFTransfoXLLMHeadModel,
|
||||||
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
|
all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
|
||||||
@@ -204,7 +203,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
|
self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_tf_available
|
from transformers import is_tf_available
|
||||||
|
|
||||||
@@ -29,13 +28,13 @@ if is_tf_available():
|
|||||||
TFXLMForSequenceClassification,
|
TFXLMForSequenceClassification,
|
||||||
TFXLMForQuestionAnsweringSimple,
|
TFXLMForQuestionAnsweringSimple,
|
||||||
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
|
all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
|
||||||
@@ -251,7 +250,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
|
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ import unittest
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import XLNetConfig, is_tf_available
|
from transformers import XLNetConfig, is_tf_available
|
||||||
|
|
||||||
@@ -30,18 +29,21 @@ if is_tf_available():
|
|||||||
|
|
||||||
from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
|
from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
|
||||||
TFXLNetForSequenceClassification,
|
TFXLNetForSequenceClassification,
|
||||||
|
TFXLNetForTokenClassification,
|
||||||
TFXLNetForQuestionAnsweringSimple,
|
TFXLNetForQuestionAnsweringSimple,
|
||||||
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel,
|
all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel,
|
||||||
TFXLNetForSequenceClassification,
|
TFXLNetForSequenceClassification,
|
||||||
|
TFXLNetForTokenClassification,
|
||||||
TFXLNetForQuestionAnsweringSimple) if is_tf_available() else ()
|
TFXLNetForQuestionAnsweringSimple) if is_tf_available() else ()
|
||||||
test_pruning = False
|
test_pruning = False
|
||||||
|
|
||||||
@@ -258,6 +260,26 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
list(list(mem.shape) for mem in result["mems_1"]),
|
list(list(mem.shape) for mem in result["mems_1"]),
|
||||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||||
|
|
||||||
|
def create_and_check_xlnet_for_token_classification(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||||
|
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||||
|
config.num_labels = input_ids_1.shape[1]
|
||||||
|
model = TFXLNetForTokenClassification(config)
|
||||||
|
inputs = {'input_ids': input_ids_1,
|
||||||
|
'attention_mask': input_mask,
|
||||||
|
# 'token_type_ids': token_type_ids
|
||||||
|
}
|
||||||
|
logits, mems_1 = model(inputs)
|
||||||
|
result = {
|
||||||
|
"mems_1": [mem.numpy() for mem in mems_1],
|
||||||
|
"logits": logits.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["logits"].shape),
|
||||||
|
[self.batch_size, self.seq_length, config.num_labels])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(list(mem.shape) for mem in result["mems_1"]),
|
||||||
|
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||||
|
|
||||||
def prepare_config_and_inputs_for_common(self):
|
def prepare_config_and_inputs_for_common(self):
|
||||||
config_and_inputs = self.prepare_config_and_inputs()
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||||
@@ -282,19 +304,23 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
def test_xlnet_lm_head(self):
|
def test_xlnet_lm_head(self):
|
||||||
self.model_tester.set_seed()
|
self.model_tester.set_seed()
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||||
|
|
||||||
def test_xlnet_sequence_classif(self):
|
def test_xlnet_sequence_classif(self):
|
||||||
self.model_tester.set_seed()
|
self.model_tester.set_seed()
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
|
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_xlnet_token_classification(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_xlnet_for_token_classification(*config_and_inputs)
|
||||||
|
|
||||||
def test_xlnet_qa(self):
|
def test_xlnet_qa(self):
|
||||||
self.model_tester.set_seed()
|
self.model_tester.set_seed()
|
||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
|
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ from __future__ import print_function
|
|||||||
import unittest
|
import unittest
|
||||||
import random
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
@@ -27,12 +26,13 @@ if is_torch_available():
|
|||||||
import torch
|
import torch
|
||||||
from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
||||||
from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
|
all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
|
||||||
@@ -111,6 +111,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
|
def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
|
||||||
model = TransfoXLModel(config)
|
model = TransfoXLModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
hidden_states_1, mems_1 = model(input_ids_1)
|
hidden_states_1, mems_1 = model(input_ids_1)
|
||||||
@@ -140,6 +141,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
|
def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
|
||||||
model = TransfoXLLMHeadModel(config)
|
model = TransfoXLLMHeadModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
lm_logits_1, mems_1 = model(input_ids_1)
|
lm_logits_1, mems_1 = model(input_ids_1)
|
||||||
@@ -204,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
|
output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
|
||||||
self.model_tester.check_transfo_xl_lm_head_output(output_result)
|
self.model_tester.check_transfo_xl_lm_head_output(output_result)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user