Merge branch 'master' into cli
This commit is contained in:
@@ -70,6 +70,27 @@ jobs:
|
|||||||
- run: sudo pip install pytest codecov pytest-cov
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
- run: python -m pytest -sv ./transformers/tests/ --cov
|
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||||
- run: codecov
|
- run: codecov
|
||||||
|
build_py3_custom_tokenizers:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.5
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install --progress-bar off .
|
||||||
|
- run: sudo pip install pytest
|
||||||
|
- run: sudo pip install mecab-python3
|
||||||
|
- run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
|
||||||
|
build_py2_custom_tokenizers:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:2.7
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install --progress-bar off .
|
||||||
|
- run: sudo pip install pytest
|
||||||
|
- run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
|
||||||
|
- run: sudo pip install mecab-python
|
||||||
|
- run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
|
||||||
deploy_doc:
|
deploy_doc:
|
||||||
working_directory: ~/transformers
|
working_directory: ~/transformers
|
||||||
docker:
|
docker:
|
||||||
@@ -82,6 +103,16 @@ jobs:
|
|||||||
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
||||||
- run: sudo pip install --progress-bar off -r requirements.txt
|
- run: sudo pip install --progress-bar off -r requirements.txt
|
||||||
- run: ./.circleci/deploy.sh
|
- run: ./.circleci/deploy.sh
|
||||||
|
repository_consistency:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.5
|
||||||
|
resource_class: small
|
||||||
|
parallelism: 1
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install requests
|
||||||
|
- run: python ./utils/link_tester.py
|
||||||
workflow_filters: &workflow_filters
|
workflow_filters: &workflow_filters
|
||||||
filters:
|
filters:
|
||||||
branches:
|
branches:
|
||||||
@@ -91,6 +122,9 @@ workflows:
|
|||||||
version: 2
|
version: 2
|
||||||
build_and_test:
|
build_and_test:
|
||||||
jobs:
|
jobs:
|
||||||
|
- repository_consistency
|
||||||
|
- build_py3_custom_tokenizers
|
||||||
|
- build_py2_custom_tokenizers
|
||||||
- build_py3_torch_and_tf
|
- build_py3_torch_and_tf
|
||||||
- build_py3_torch
|
- build_py3_torch
|
||||||
- build_py3_tf
|
- build_py3_tf
|
||||||
|
|||||||
46
README.md
46
README.md
@@ -56,9 +56,10 @@ Choose the right framework for every part of a model's lifetime
|
|||||||
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
||||||
| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
|
| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
|
||||||
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
||||||
|
| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
|
||||||
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
||||||
| [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
| [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
||||||
| [Documentation][(v2.2.0/v2.2.1)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
|
| [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
@@ -144,7 +145,8 @@ At some point in the future, you'll be able to seamlessly move from pre-training
|
|||||||
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||||
10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||||
11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||||
11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
|
12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||||
|
13. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
|
||||||
|
|
||||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
||||||
|
|
||||||
@@ -445,6 +447,46 @@ python ./examples/run_generation.py \
|
|||||||
--repetition_penalty=1.2 \
|
--repetition_penalty=1.2 \
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Quick tour of model sharing
|
||||||
|
|
||||||
|
New in `v2.2.2`: you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
|
||||||
|
|
||||||
|
**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
transformers-cli login
|
||||||
|
# log in using the same credentials as on huggingface.co
|
||||||
|
```
|
||||||
|
Upload your model:
|
||||||
|
```shell
|
||||||
|
transformers-cli upload ./path/to/pretrained_model/
|
||||||
|
|
||||||
|
# ^^ Upload folder containing weights/tokenizer/config
|
||||||
|
# saved via `.save_pretrained()`
|
||||||
|
|
||||||
|
transformers-cli upload ./config.json [--filename folder/foobar.json]
|
||||||
|
|
||||||
|
# ^^ Upload a single file
|
||||||
|
# (you can optionally override its filename, which can be nested inside a folder)
|
||||||
|
```
|
||||||
|
|
||||||
|
Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
|
||||||
|
```python
|
||||||
|
"username/pretrained_model"
|
||||||
|
```
|
||||||
|
|
||||||
|
Anyone can load it from code:
|
||||||
|
```python
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
|
||||||
|
model = AutoModel.from_pretrained("username/pretrained_model")
|
||||||
|
```
|
||||||
|
|
||||||
|
Finally, list all your files on S3:
|
||||||
|
```shell
|
||||||
|
transformers-cli ls
|
||||||
|
# List all your S3 objects.
|
||||||
|
```
|
||||||
|
|
||||||
## Migrating from pytorch-transformers to transformers
|
## Migrating from pytorch-transformers to transformers
|
||||||
|
|
||||||
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
|
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ author = u'huggingface'
|
|||||||
# The short X.Y version
|
# The short X.Y version
|
||||||
version = u''
|
version = u''
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = u'2.2.1'
|
release = u'2.2.2'
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|||||||
@@ -58,6 +58,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
|||||||
installation
|
installation
|
||||||
quickstart
|
quickstart
|
||||||
pretrained_models
|
pretrained_models
|
||||||
|
model_sharing
|
||||||
examples
|
examples
|
||||||
notebooks
|
notebooks
|
||||||
serialization
|
serialization
|
||||||
|
|||||||
40
docs/source/model_sharing.md
Normal file
40
docs/source/model_sharing.md
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Model upload and sharing
|
||||||
|
|
||||||
|
Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
|
||||||
|
|
||||||
|
**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
transformers-cli login
|
||||||
|
# log in using the same credentials as on huggingface.co
|
||||||
|
```
|
||||||
|
Upload your model:
|
||||||
|
```shell
|
||||||
|
transformers-cli upload ./path/to/pretrained_model/
|
||||||
|
|
||||||
|
# ^^ Upload folder containing weights/tokenizer/config
|
||||||
|
# saved via `.save_pretrained()`
|
||||||
|
|
||||||
|
transformers-cli upload ./config.json [--filename folder/foobar.json]
|
||||||
|
|
||||||
|
# ^^ Upload a single file
|
||||||
|
# (you can optionally override its filename, which can be nested inside a folder)
|
||||||
|
```
|
||||||
|
|
||||||
|
Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
|
||||||
|
```python
|
||||||
|
"username/pretrained_model"
|
||||||
|
```
|
||||||
|
|
||||||
|
Anyone can load it from code:
|
||||||
|
```python
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
|
||||||
|
model = AutoModel.from_pretrained("username/pretrained_model")
|
||||||
|
```
|
||||||
|
|
||||||
|
Finally, list all your files on S3:
|
||||||
|
```shell
|
||||||
|
transformers-cli ls
|
||||||
|
# List all your S3 objects.
|
||||||
|
```
|
||||||
|
|
||||||
@@ -61,6 +61,32 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
| | | | Trained on uncased German text by DBMDZ |
|
| | | | Trained on uncased German text by DBMDZ |
|
||||||
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
|
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
|
| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. |
|
||||||
|
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
|
||||||
|
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
|
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
|
||||||
|
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
|
||||||
|
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
|
| | | | Trained on Japanese text. Text is tokenized into characters. |
|
||||||
|
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
|
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. |
|
||||||
|
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``bert-base-finnish-cased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
|
| | | | Trained on cased Finnish text. |
|
||||||
|
| | | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__). |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``bert-base-finnish-uncased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
|
| | | | Trained on uncased Finnish text. |
|
||||||
|
| | | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__). |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||||
| | | | OpenAI GPT English model |
|
| | | | OpenAI GPT English model |
|
||||||
@@ -169,35 +195,50 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
| ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||||
| | | | ALBERT base model |
|
| | | | ALBERT base model |
|
||||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
| | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||||
| | | | ALBERT large model |
|
| | | | ALBERT large model |
|
||||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
| | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||||
| | | | ALBERT xlarge model |
|
| | | | ALBERT xlarge model |
|
||||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``albert-xxlarge-v1`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
| | ``albert-xxlarge-v1`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||||
| | | | ALBERT xxlarge model |
|
| | | | ALBERT xxlarge model |
|
||||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
| | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||||
| | | | ALBERT base model with no dropout, additional training data and longer training |
|
| | | | ALBERT base model with no dropout, additional training data and longer training |
|
||||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
| | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||||
| | | | ALBERT large model with no dropout, additional training data and longer training |
|
| | | | ALBERT large model with no dropout, additional training data and longer training |
|
||||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
| | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||||
| | | | ALBERT xlarge model with no dropout, additional training data and longer training |
|
| | | | ALBERT xlarge model with no dropout, additional training data and longer training |
|
||||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``albert-xxlarge-v2`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
| | ``albert-xxlarge-v2`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||||
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
|
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
|
||||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| T5 | ``t5-small`` | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads, |
|
||||||
|
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``t5-base`` | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads, |
|
||||||
|
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``t5-large`` | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads, |
|
||||||
|
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``t5-3B`` | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads, |
|
||||||
|
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``t5-11B`` | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads, |
|
||||||
|
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,8 +24,6 @@ pip install -r ./examples/requirements.txt
|
|||||||
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
||||||
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
|
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
|
||||||
| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
|
| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
|
||||||
| [Abstractive summarization](#abstractive-summarization) | Using the BertAbs
|
|
||||||
model finetuned on the CNN/DailyMail dataset to generate summaries. |
|
|
||||||
|
|
||||||
## TensorFlow 2.0 Bert models on GLUE
|
## TensorFlow 2.0 Bert models on GLUE
|
||||||
|
|
||||||
@@ -469,7 +467,7 @@ Training with the previously defined hyper-parameters yields the following resul
|
|||||||
## Named Entity Recognition
|
## Named Entity Recognition
|
||||||
|
|
||||||
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
|
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
|
||||||
[`run_tf_ner.py`(https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py)] for Tensorflow 2.
|
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
|
||||||
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
|
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
|
||||||
Details and results for the fine-tuning provided by @stefan-it.
|
Details and results for the fine-tuning provided by @stefan-it.
|
||||||
|
|
||||||
@@ -646,34 +644,6 @@ micro avg 0.8722 0.8774 0.8748 13869
|
|||||||
macro avg 0.8712 0.8774 0.8740 13869
|
macro avg 0.8712 0.8774 0.8740 13869
|
||||||
```
|
```
|
||||||
|
|
||||||
## Abstractive summarization
|
|
||||||
|
|
||||||
Based on the script
|
|
||||||
[`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py).
|
|
||||||
|
|
||||||
Before running this script you should download **both** CNN and Daily Mail
|
|
||||||
datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the
|
|
||||||
links next to "Stories") in the same folder. Then uncompress the archives by running:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
|
|
||||||
```
|
|
||||||
|
|
||||||
note that the finetuning script **will not work** if you do not download both
|
|
||||||
datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both
|
|
||||||
archive.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
export DATA_PATH=/path/to/dataset/
|
|
||||||
|
|
||||||
python run_summarization_finetuning.py \
|
|
||||||
--output_dir=output \
|
|
||||||
--model_type=bert2bert \
|
|
||||||
--model_name_or_path=bert2bert \
|
|
||||||
--do_train \
|
|
||||||
--data_path=$DATA_PATH \
|
|
||||||
```
|
|
||||||
|
|
||||||
## XNLI
|
## XNLI
|
||||||
|
|
||||||
Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
|
Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
|
||||||
|
|||||||
@@ -247,7 +247,11 @@ def main():
|
|||||||
out = out[:, len(context_tokens):].tolist()
|
out = out[:, len(context_tokens):].tolist()
|
||||||
for o in out:
|
for o in out:
|
||||||
text = tokenizer.decode(o, clean_up_tokenization_spaces=True)
|
text = tokenizer.decode(o, clean_up_tokenization_spaces=True)
|
||||||
text = text[: text.find(args.stop_token) if args.stop_token else None]
|
if args.stop_token:
|
||||||
|
index = text.find(args.stop_token)
|
||||||
|
if index == -1:
|
||||||
|
index = None
|
||||||
|
text = text[:index]
|
||||||
|
|
||||||
print(text)
|
print(text)
|
||||||
|
|
||||||
|
|||||||
@@ -380,7 +380,7 @@ def main():
|
|||||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||||
help="The initial learning rate for Adam.")
|
help="The initial learning rate for Adam.")
|
||||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||||
help="Weight deay if we apply some.")
|
help="Weight decay if we apply some.")
|
||||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||||
help="Epsilon for Adam optimizer.")
|
help="Epsilon for Adam optimizer.")
|
||||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||||
|
|||||||
@@ -61,7 +61,6 @@ MODEL_CLASSES = {
|
|||||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
||||||
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
|
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
|
||||||
'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
|
'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def set_seed(args):
|
def set_seed(args):
|
||||||
@@ -223,7 +222,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
# multi-gpu evaluate
|
# multi-gpu evaluate
|
||||||
if args.n_gpu > 1:
|
if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
|
||||||
model = torch.nn.DataParallel(model)
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
# Eval!
|
# Eval!
|
||||||
@@ -299,10 +298,13 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
|
|
||||||
# XLNet and XLM use a more complex post-processing procedure
|
# XLNet and XLM use a more complex post-processing procedure
|
||||||
if args.model_type in ['xlnet', 'xlm']:
|
if args.model_type in ['xlnet', 'xlm']:
|
||||||
|
start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
|
||||||
|
end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
|
||||||
|
|
||||||
predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
|
predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
|
||||||
args.max_answer_length, output_prediction_file,
|
args.max_answer_length, output_prediction_file,
|
||||||
output_nbest_file, output_null_log_odds_file,
|
output_nbest_file, output_null_log_odds_file,
|
||||||
model.config.start_n_top, model.config.end_n_top,
|
start_n_top, end_n_top,
|
||||||
args.version_2_with_negative, tokenizer, args.verbose_logging)
|
args.version_2_with_negative, tokenizer, args.verbose_logging)
|
||||||
else:
|
else:
|
||||||
predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
|
predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
|
||||||
@@ -334,7 +336,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
|
|||||||
else:
|
else:
|
||||||
logger.info("Creating features from dataset file at %s", input_dir)
|
logger.info("Creating features from dataset file at %s", input_dir)
|
||||||
|
|
||||||
if not args.data_dir:
|
if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
|
||||||
try:
|
try:
|
||||||
import tensorflow_datasets as tfds
|
import tensorflow_datasets as tfds
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -347,7 +349,11 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
|
|||||||
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||||
else:
|
else:
|
||||||
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
|
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
|
||||||
examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
|
||||||
|
if evaluate:
|
||||||
|
examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
|
||||||
|
else:
|
||||||
|
examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
|
||||||
|
|
||||||
features, dataset = squad_convert_examples_to_features(
|
features, dataset = squad_convert_examples_to_features(
|
||||||
examples=examples,
|
examples=examples,
|
||||||
@@ -384,7 +390,14 @@ def main():
|
|||||||
|
|
||||||
## Other parameters
|
## Other parameters
|
||||||
parser.add_argument("--data_dir", default=None, type=str,
|
parser.add_argument("--data_dir", default=None, type=str,
|
||||||
help="The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets.")
|
help="The input data dir. Should contain the .json files for the task." +
|
||||||
|
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||||
|
parser.add_argument("--train_file", default=None, type=str,
|
||||||
|
help="The input training file. If a data dir is specified, will look for the file there" +
|
||||||
|
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||||
|
parser.add_argument("--predict_file", default=None, type=str,
|
||||||
|
help="The input evaluation file. If a data dir is specified, will look for the file there" +
|
||||||
|
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||||
parser.add_argument("--config_name", default="", type=str,
|
parser.add_argument("--config_name", default="", type=str,
|
||||||
help="Pretrained config name or path if not the same as model_name")
|
help="Pretrained config name or path if not the same as model_name")
|
||||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||||
@@ -469,11 +482,6 @@ def main():
|
|||||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format(
|
|
||||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
|
||||||
str(args.max_seq_length))
|
|
||||||
)
|
|
||||||
|
|
||||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||||
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||||
|
|
||||||
@@ -571,10 +579,16 @@ def main():
|
|||||||
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
||||||
results = {}
|
results = {}
|
||||||
if args.do_eval and args.local_rank in [-1, 0]:
|
if args.do_eval and args.local_rank in [-1, 0]:
|
||||||
checkpoints = [args.output_dir]
|
|
||||||
if args.eval_all_checkpoints:
|
if args.do_train:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
logger.info("Loading checkpoints saved during training for evaluation")
|
||||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
checkpoints = [args.output_dir]
|
||||||
|
if args.eval_all_checkpoints:
|
||||||
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||||
|
else:
|
||||||
|
logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
|
||||||
|
checkpoints = [args.model_name_or_path]
|
||||||
|
|
||||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ And move all the stories to the same folder. We will refer as `$DATA_PATH` the p
|
|||||||
python run_summarization.py \
|
python run_summarization.py \
|
||||||
--documents_dir $DATA_PATH \
|
--documents_dir $DATA_PATH \
|
||||||
--summaries_output_dir $SUMMARIES_PATH \ # optional
|
--summaries_output_dir $SUMMARIES_PATH \ # optional
|
||||||
--to_cpu false \
|
--no_cuda false \
|
||||||
--batch_size 4 \
|
--batch_size 4 \
|
||||||
--min_length 50 \
|
--min_length 50 \
|
||||||
--max_length 200 \
|
--max_length 200 \
|
||||||
@@ -39,7 +39,7 @@ python run_summarization.py \
|
|||||||
--compute_rouge true
|
--compute_rouge true
|
||||||
```
|
```
|
||||||
|
|
||||||
The script executes on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written to a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
|
The script executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written to a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
|
||||||
|
|
||||||
## Summarize any text
|
## Summarize any text
|
||||||
|
|
||||||
@@ -49,7 +49,7 @@ Put the documents that you would like to summarize in a folder (the path to whic
|
|||||||
python run_summarization.py \
|
python run_summarization.py \
|
||||||
--documents_dir $DATA_PATH \
|
--documents_dir $DATA_PATH \
|
||||||
--summaries_output_dir $SUMMARIES_PATH \ # optional
|
--summaries_output_dir $SUMMARIES_PATH \ # optional
|
||||||
--to_cpu false \
|
--no_cuda false \
|
||||||
--batch_size 4 \
|
--batch_size 4 \
|
||||||
--min_length 50 \
|
--min_length 50 \
|
||||||
--max_length 200 \
|
--max_length 200 \
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ class BertAbsConfig(PretrainedConfig):
|
|||||||
r""" Class to store the configuration of the BertAbs model.
|
r""" Class to store the configuration of the BertAbs model.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
vocab_size: int
|
||||||
|
Number of tokens in the vocabulary.
|
||||||
max_pos: int
|
max_pos: int
|
||||||
The maximum sequence length that this model will be used with.
|
The maximum sequence length that this model will be used with.
|
||||||
enc_layer: int
|
enc_layer: int
|
||||||
@@ -65,7 +67,7 @@ class BertAbsConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=30522,
|
vocab_size=30522,
|
||||||
max_pos=512,
|
max_pos=512,
|
||||||
enc_layers=6,
|
enc_layers=6,
|
||||||
enc_hidden_size=512,
|
enc_hidden_size=512,
|
||||||
@@ -81,39 +83,17 @@ class BertAbsConfig(PretrainedConfig):
|
|||||||
):
|
):
|
||||||
super(BertAbsConfig, self).__init__(**kwargs)
|
super(BertAbsConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
if self._input_is_path_to_json(vocab_size_or_config_json_file):
|
self.vocab_size = vocab_size
|
||||||
path_to_json = vocab_size_or_config_json_file
|
self.max_pos = max_pos
|
||||||
with open(path_to_json, "r", encoding="utf-8") as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
|
||||||
self.max_pos = max_pos
|
|
||||||
|
|
||||||
self.enc_layers = enc_layers
|
self.enc_layers = enc_layers
|
||||||
self.enc_hidden_size = enc_hidden_size
|
self.enc_hidden_size = enc_hidden_size
|
||||||
self.enc_heads = enc_heads
|
self.enc_heads = enc_heads
|
||||||
self.enc_ff_size = enc_ff_size
|
self.enc_ff_size = enc_ff_size
|
||||||
self.enc_dropout = enc_dropout
|
self.enc_dropout = enc_dropout
|
||||||
|
|
||||||
self.dec_layers = dec_layers
|
self.dec_layers = dec_layers
|
||||||
self.dec_hidden_size = dec_hidden_size
|
self.dec_hidden_size = dec_hidden_size
|
||||||
self.dec_heads = dec_heads
|
self.dec_heads = dec_heads
|
||||||
self.dec_ff_size = dec_ff_size
|
self.dec_ff_size = dec_ff_size
|
||||||
self.dec_dropout = dec_dropout
|
self.dec_dropout = dec_dropout
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
def _input_is_path_to_json(self, first_argument):
|
|
||||||
""" Checks whether the first argument passed to config
|
|
||||||
is the path to a JSON file that contains the config.
|
|
||||||
"""
|
|
||||||
is_python_2 = sys.version_info[0] == 2
|
|
||||||
if is_python_2:
|
|
||||||
return isinstance(first_argument, unicode)
|
|
||||||
else:
|
|
||||||
return isinstance(first_argument, str)
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ boto3
|
|||||||
# Used for downloading models over HTTP
|
# Used for downloading models over HTTP
|
||||||
requests
|
requests
|
||||||
# For OpenAI GPT
|
# For OpenAI GPT
|
||||||
regex
|
regex != 2019.12.17
|
||||||
# For XLNet
|
# For XLNet
|
||||||
sentencepiece
|
sentencepiece
|
||||||
# For XLM
|
# For XLM
|
||||||
|
|||||||
4
setup.py
4
setup.py
@@ -46,7 +46,7 @@ extras['all'] = [package for package in extras.values()]
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="transformers",
|
name="transformers",
|
||||||
version="2.2.1",
|
version="2.2.2",
|
||||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
||||||
author_email="thomas@huggingface.co",
|
author_email="thomas@huggingface.co",
|
||||||
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
||||||
@@ -61,7 +61,7 @@ setup(
|
|||||||
'boto3',
|
'boto3',
|
||||||
'requests',
|
'requests',
|
||||||
'tqdm',
|
'tqdm',
|
||||||
'regex',
|
'regex != 2019.12.17',
|
||||||
'sentencepiece',
|
'sentencepiece',
|
||||||
'sacremoses'],
|
'sacremoses'],
|
||||||
extras_require=extras,
|
extras_require=extras,
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ class XxxConfig(PretrainedConfig):
|
|||||||
|
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XxxModel`.
|
vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
|
||||||
hidden_size: Size of the encoder layers and the pooler layer.
|
hidden_size: Size of the encoder layers and the pooler layer.
|
||||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||||
num_attention_heads: Number of attention heads for each attention layer in
|
num_attention_heads: Number of attention heads for each attention layer in
|
||||||
@@ -64,7 +64,7 @@ class XxxConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=50257,
|
vocab_size=50257,
|
||||||
n_positions=1024,
|
n_positions=1024,
|
||||||
n_ctx=1024,
|
n_ctx=1024,
|
||||||
n_embd=768,
|
n_embd=768,
|
||||||
@@ -75,8 +75,6 @@ class XxxConfig(PretrainedConfig):
|
|||||||
attn_pdrop=0.1,
|
attn_pdrop=0.1,
|
||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
summary_type='cls_index',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -84,7 +82,7 @@ class XxxConfig(PretrainedConfig):
|
|||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
super(XxxConfig, self).__init__(**kwargs)
|
super(XxxConfig, self).__init__(**kwargs)
|
||||||
self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1
|
self.vocab_size = vocab_size
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
self.n_embd = n_embd
|
self.n_embd = n_embd
|
||||||
@@ -95,23 +93,11 @@ class XxxConfig(PretrainedConfig):
|
|||||||
self.attn_pdrop = attn_pdrop
|
self.attn_pdrop = attn_pdrop
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
self.summary_type = summary_type
|
||||||
self.summary_use_proj = summary_use_proj
|
self.summary_use_proj = summary_use_proj
|
||||||
self.summary_activation = summary_activation
|
self.summary_activation = summary_activation
|
||||||
self.summary_first_dropout = summary_first_dropout
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
if isinstance(vocab_size_or_config_json_file, six.string_types):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
|
|||||||
@@ -26,9 +26,9 @@ from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
|
|||||||
import logging
|
import logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, xxx_config_file, pytorch_dump_path):
|
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
|
||||||
# Initialise PyTorch model
|
# Initialise PyTorch model
|
||||||
config = XxxConfig.from_json_file(xxx_config_file)
|
config = XxxConfig.from_json_file(config_file)
|
||||||
print("Building PyTorch model from configuration: {}".format(str(config)))
|
print("Building PyTorch model from configuration: {}".format(str(config)))
|
||||||
model = XxxForPreTraining(config)
|
model = XxxForPreTraining(config)
|
||||||
|
|
||||||
@@ -48,11 +48,11 @@ if __name__ == "__main__":
|
|||||||
type = str,
|
type = str,
|
||||||
required = True,
|
required = True,
|
||||||
help = "Path to the TensorFlow checkpoint path.")
|
help = "Path to the TensorFlow checkpoint path.")
|
||||||
parser.add_argument("--xxx_config_file",
|
parser.add_argument("--config_file",
|
||||||
default = None,
|
default = None,
|
||||||
type = str,
|
type = str,
|
||||||
required = True,
|
required = True,
|
||||||
help = "The config json file corresponding to the pre-trained XXX model. \n"
|
help = "The config json file corresponding to the pre-trained model. \n"
|
||||||
"This specifies the model architecture.")
|
"This specifies the model architecture.")
|
||||||
parser.add_argument("--pytorch_dump_path",
|
parser.add_argument("--pytorch_dump_path",
|
||||||
default = None,
|
default = None,
|
||||||
@@ -61,5 +61,5 @@ if __name__ == "__main__":
|
|||||||
help = "Path to the output PyTorch model.")
|
help = "Path to the output PyTorch model.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
|
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
|
||||||
args.xxx_config_file,
|
args.config_file,
|
||||||
args.pytorch_dump_path)
|
args.pytorch_dump_path)
|
||||||
|
|||||||
@@ -26,6 +26,8 @@ import logging
|
|||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import copy
|
||||||
|
import itertools
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ import logging
|
|||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import copy
|
||||||
|
import itertools
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = XxxConfig(
|
config = XxxConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = XxxConfig(
|
config = XxxConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -85,7 +85,7 @@ class XxxTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file
|
vocab_file: Path to a one-wordpiece-per-line vocabulary file
|
||||||
do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
|
do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
__version__ = "2.2.1"
|
__version__ = "2.2.2"
|
||||||
|
|
||||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||||
# default Python logging output behavior when present.
|
# default Python logging output behavior when present.
|
||||||
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
|||||||
# Files and general utilities
|
# Files and general utilities
|
||||||
from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
|
from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
|
||||||
cached_path, add_start_docstrings, add_end_docstrings,
|
cached_path, add_start_docstrings, add_end_docstrings,
|
||||||
WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
|
WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME,
|
||||||
is_tf_available, is_torch_available)
|
is_tf_available, is_torch_available)
|
||||||
|
|
||||||
from .data import (is_sklearn_available,
|
from .data import (is_sklearn_available,
|
||||||
@@ -34,10 +34,14 @@ from .data import (is_sklearn_available,
|
|||||||
if is_sklearn_available():
|
if is_sklearn_available():
|
||||||
from .data import glue_compute_metrics, xnli_compute_metrics
|
from .data import glue_compute_metrics, xnli_compute_metrics
|
||||||
|
|
||||||
|
# Model Cards
|
||||||
|
from .model_card import ModelCard
|
||||||
|
|
||||||
# Tokenizers
|
# Tokenizers
|
||||||
from .tokenization_utils import (PreTrainedTokenizer)
|
from .tokenization_utils import (PreTrainedTokenizer)
|
||||||
from .tokenization_auto import AutoTokenizer
|
from .tokenization_auto import AutoTokenizer
|
||||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||||
|
from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
|
||||||
from .tokenization_openai import OpenAIGPTTokenizer
|
from .tokenization_openai import OpenAIGPTTokenizer
|
||||||
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
||||||
from .tokenization_gpt2 import GPT2Tokenizer
|
from .tokenization_gpt2 import GPT2Tokenizer
|
||||||
@@ -48,28 +52,29 @@ from .tokenization_roberta import RobertaTokenizer
|
|||||||
from .tokenization_distilbert import DistilBertTokenizer
|
from .tokenization_distilbert import DistilBertTokenizer
|
||||||
from .tokenization_albert import AlbertTokenizer
|
from .tokenization_albert import AlbertTokenizer
|
||||||
from .tokenization_camembert import CamembertTokenizer
|
from .tokenization_camembert import CamembertTokenizer
|
||||||
|
from .tokenization_t5 import T5Tokenizer
|
||||||
|
|
||||||
# Configurations
|
# Configurations
|
||||||
from .configuration_utils import PretrainedConfig
|
from .configuration_utils import PretrainedConfig
|
||||||
from .configuration_auto import AutoConfig
|
from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
# Modeling
|
# Modeling
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
|
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
|
||||||
from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
|
from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
|
||||||
AutoModelWithLMHead, AutoModelForTokenClassification)
|
AutoModelWithLMHead, AutoModelForTokenClassification, ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
|
from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
|
||||||
BertForMaskedLM, BertForNextSentencePrediction,
|
BertForMaskedLM, BertForNextSentencePrediction,
|
||||||
@@ -77,8 +82,8 @@ if is_torch_available():
|
|||||||
BertForTokenClassification, BertForQuestionAnswering,
|
BertForTokenClassification, BertForQuestionAnswering,
|
||||||
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
|
from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
|
||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
||||||
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
||||||
AdaptiveEmbedding,
|
AdaptiveEmbedding,
|
||||||
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
@@ -110,6 +115,9 @@ if is_torch_available():
|
|||||||
CamembertForTokenClassification,
|
CamembertForTokenClassification,
|
||||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
||||||
|
from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
|
||||||
|
load_tf_weights_in_t5,
|
||||||
|
T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
|
from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
|
||||||
AlbertForQuestionAnswering,
|
AlbertForQuestionAnswering,
|
||||||
@@ -124,7 +132,7 @@ if is_torch_available():
|
|||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
|
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
|
||||||
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
||||||
TFAutoModelWithLMHead, TFAutoModelForTokenClassification)
|
TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
|
from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
|
||||||
TFBertModel, TFBertForPreTraining,
|
TFBertModel, TFBertForPreTraining,
|
||||||
@@ -178,6 +186,10 @@ if is_tf_available():
|
|||||||
from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
|
from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
|
||||||
TFAlbertForSequenceClassification,
|
TFAlbertForSequenceClassification,
|
||||||
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
|
||||||
|
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
# Optimization
|
# Optimization
|
||||||
from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
|
from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ def main():
|
|||||||
# parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]')
|
# parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]')
|
||||||
# commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
|
# commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
|
||||||
|
|
||||||
|
|
||||||
# # Register commands
|
# # Register commands
|
||||||
# ServeCommand.register_subcommand(commands_parser)
|
# ServeCommand.register_subcommand(commands_parser)
|
||||||
|
|
||||||
|
|||||||
@@ -19,8 +19,8 @@ class UserCommands(BaseTransformersCLICommand):
|
|||||||
list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
|
list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
|
||||||
# upload
|
# upload
|
||||||
upload_parser = parser.add_parser('upload')
|
upload_parser = parser.add_parser('upload')
|
||||||
upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
|
upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.')
|
||||||
upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
|
upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.')
|
||||||
upload_parser.set_defaults(func=lambda args: UploadCommand(args))
|
upload_parser.set_defaults(func=lambda args: UploadCommand(args))
|
||||||
|
|
||||||
|
|
||||||
@@ -138,28 +138,57 @@ class ListObjsCommand(BaseUserCommand):
|
|||||||
|
|
||||||
|
|
||||||
class UploadCommand(BaseUserCommand):
|
class UploadCommand(BaseUserCommand):
|
||||||
|
def walk_dir(self, rel_path):
|
||||||
|
"""
|
||||||
|
Recursively list all files in a folder.
|
||||||
|
"""
|
||||||
|
entries: List[os.DirEntry] = list(os.scandir(rel_path))
|
||||||
|
files = [
|
||||||
|
(
|
||||||
|
os.path.join(os.getcwd(), f.path), # filepath
|
||||||
|
f.path # filename
|
||||||
|
)
|
||||||
|
for f in entries if f.is_file()
|
||||||
|
]
|
||||||
|
for f in entries:
|
||||||
|
if f.is_dir():
|
||||||
|
files += self.walk_dir(f.path)
|
||||||
|
return files
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
token = HfFolder.get_token()
|
token = HfFolder.get_token()
|
||||||
if token is None:
|
if token is None:
|
||||||
print("Not logged in")
|
print("Not logged in")
|
||||||
exit(1)
|
exit(1)
|
||||||
filepath = os.path.join(os.getcwd(), self.args.file)
|
local_path = os.path.abspath(self.args.path)
|
||||||
filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
|
if os.path.isdir(local_path):
|
||||||
print(
|
if self.args.filename is not None:
|
||||||
"About to upload file {} to S3 under filename {}".format(
|
raise ValueError("Cannot specify a filename override when uploading a folder.")
|
||||||
ANSI.bold(filepath), ANSI.bold(filename)
|
rel_path = os.path.basename(local_path)
|
||||||
|
files = self.walk_dir(rel_path)
|
||||||
|
elif os.path.isfile(local_path):
|
||||||
|
filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
|
||||||
|
files = [(local_path, filename)]
|
||||||
|
else:
|
||||||
|
raise ValueError("Not a valid file or directory: {}".format(local_path))
|
||||||
|
|
||||||
|
for filepath, filename in files:
|
||||||
|
print(
|
||||||
|
"About to upload file {} to S3 under filename {}".format(
|
||||||
|
ANSI.bold(filepath), ANSI.bold(filename)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
choice = input("Proceed? [Y/n] ").lower()
|
choice = input("Proceed? [Y/n] ").lower()
|
||||||
if not(choice == "" or choice == "y" or choice == "yes"):
|
if not(choice == "" or choice == "y" or choice == "yes"):
|
||||||
print("Abort")
|
print("Abort")
|
||||||
exit()
|
exit()
|
||||||
print(
|
print(
|
||||||
ANSI.bold("Uploading... This might take a while if file is large")
|
ANSI.bold("Uploading... This might take a while if files are large")
|
||||||
)
|
)
|
||||||
access_url = self._api.presign_and_upload(
|
for filepath, filename in files:
|
||||||
token=token, filename=filename, filepath=filepath
|
access_url = self._api.presign_and_upload(
|
||||||
)
|
token=token, filename=filename, filepath=filepath
|
||||||
print("Your file now lives at:")
|
)
|
||||||
print(access_url)
|
print("Your file now lives at:")
|
||||||
|
print(access_url)
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30000,
|
vocab_size=30000,
|
||||||
embedding_size=128,
|
embedding_size=128,
|
||||||
hidden_size=4096,
|
hidden_size=4096,
|
||||||
num_hidden_layers=12,
|
num_hidden_layers=12,
|
||||||
@@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig):
|
|||||||
"""
|
"""
|
||||||
super(AlbertConfig, self).__init__(**kwargs)
|
super(AlbertConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.vocab_size = vocab_size
|
||||||
self.embedding_size = embedding_size
|
self.embedding_size = embedding_size
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
@@ -97,4 +97,4 @@ class AlbertConfig(PretrainedConfig):
|
|||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.type_vocab_size = type_vocab_size
|
self.type_vocab_size = type_vocab_size
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
|
|||||||
@@ -18,21 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from .configuration_bert import BertConfig
|
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_openai import OpenAIGPTConfig
|
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_gpt2 import GPT2Config
|
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_transfo_xl import TransfoXLConfig
|
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_xlnet import XLNetConfig
|
from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_xlm import XLMConfig
|
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_roberta import RobertaConfig
|
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_distilbert import DistilBertConfig
|
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_ctrl import CTRLConfig
|
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_camembert import CamembertConfig
|
from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_albert import AlbertConfig
|
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
|
||||||
|
for pretrained_map in [
|
||||||
|
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
]
|
||||||
|
for key, value, in pretrained_map.items())
|
||||||
|
|
||||||
|
|
||||||
class AutoConfig(object):
|
class AutoConfig(object):
|
||||||
r""":class:`~transformers.AutoConfig` is a generic configuration class
|
r""":class:`~transformers.AutoConfig` is a generic configuration class
|
||||||
that will be instantiated as one of the configuration classes of the library
|
that will be instantiated as one of the configuration classes of the library
|
||||||
@@ -96,6 +115,7 @@ class AutoConfig(object):
|
|||||||
|
|
||||||
The configuration class to instantiate is selected as the first pattern matching
|
The configuration class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: T5Config (T5 model)
|
||||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||||
- contains `albert`: AlbertConfig (ALBERT model)
|
- contains `albert`: AlbertConfig (ALBERT model)
|
||||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||||
@@ -111,6 +131,7 @@ class AutoConfig(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||||
|
|
||||||
@@ -151,7 +172,9 @@ class AutoConfig(object):
|
|||||||
assert unused_kwargs == {'foo': False}
|
assert unused_kwargs == {'foo': False}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 't5' in pretrained_model_name_or_path:
|
||||||
|
return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
elif 'albert' in pretrained_model_name_or_path:
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
|
|||||||
@@ -42,6 +42,12 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
|
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
|
||||||
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
|
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
|
||||||
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
|
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
|
||||||
|
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
|
||||||
|
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
|
||||||
|
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
|
||||||
|
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
|
||||||
|
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
|
||||||
|
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -52,7 +58,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
|
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
|
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
|
||||||
hidden_size: Size of the encoder layers and the pooler layer.
|
hidden_size: Size of the encoder layers and the pooler layer.
|
||||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||||
num_attention_heads: Number of attention heads for each attention layer in
|
num_attention_heads: Number of attention heads for each attention layer in
|
||||||
@@ -77,7 +83,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30522,
|
vocab_size=30522,
|
||||||
hidden_size=768,
|
hidden_size=768,
|
||||||
num_hidden_layers=12,
|
num_hidden_layers=12,
|
||||||
num_attention_heads=12,
|
num_attention_heads=12,
|
||||||
@@ -91,25 +97,15 @@ class BertConfig(PretrainedConfig):
|
|||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
super(BertConfig, self).__init__(**kwargs)
|
super(BertConfig, self).__init__(**kwargs)
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.vocab_size = vocab_size
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.hidden_size = hidden_size
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
self.num_hidden_layers = num_hidden_layers
|
||||||
json_config = json.loads(reader.read())
|
self.num_attention_heads = num_attention_heads
|
||||||
for key, value in json_config.items():
|
self.hidden_act = hidden_act
|
||||||
self.__dict__[key] = value
|
self.intermediate_size = intermediate_size
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
self.hidden_dropout_prob = hidden_dropout_prob
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
self.hidden_size = hidden_size
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.type_vocab_size = type_vocab_size
|
||||||
self.num_attention_heads = num_attention_heads
|
self.initializer_range = initializer_range
|
||||||
self.hidden_act = hidden_act
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.intermediate_size = intermediate_size
|
|
||||||
self.hidden_dropout_prob = hidden_dropout_prob
|
|
||||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.type_vocab_size = type_vocab_size
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.layer_norm_eps = layer_norm_eps
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a `CTRLModel`.
|
"""Configuration class to store the configuration of a `CTRLModel`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
dff: Size of the inner dimension of the FFN.
|
dff: Size of the inner dimension of the FFN.
|
||||||
@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=246534,
|
vocab_size=246534,
|
||||||
n_positions=256,
|
n_positions=256,
|
||||||
n_ctx=256,
|
n_ctx=256,
|
||||||
n_embd=1280,
|
n_embd=1280,
|
||||||
@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
attn_pdrop=0.1,
|
attn_pdrop=0.1,
|
||||||
layer_norm_epsilon=1e-6,
|
layer_norm_epsilon=1e-6,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
summary_type='cls_index',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
"""Constructs CTRLConfig.
|
"""Constructs CTRLConfig.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
dff: Size of the inner dimension of the FFN.
|
dff: Size of the inner dimension of the FFN.
|
||||||
@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
"""
|
"""
|
||||||
super(CTRLConfig, self).__init__(**kwargs)
|
super(CTRLConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
|
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
self.n_embd = n_embd
|
self.n_embd = n_embd
|
||||||
@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
|
|||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
self.summary_type = summary_type
|
||||||
self.summary_use_proj = summary_use_proj
|
self.summary_use_proj = summary_use_proj
|
||||||
self.summary_activation = summary_activation
|
self.summary_activation = summary_activation
|
||||||
self.summary_first_dropout = summary_first_dropout
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30522,
|
vocab_size=30522,
|
||||||
max_position_embeddings=512,
|
max_position_embeddings=512,
|
||||||
sinusoidal_pos_embds=False,
|
sinusoidal_pos_embds=False,
|
||||||
n_layers=6,
|
n_layers=6,
|
||||||
@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
seq_classif_dropout=0.2,
|
seq_classif_dropout=0.2,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
super(DistilBertConfig, self).__init__(**kwargs)
|
super(DistilBertConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
||||||
|
self.n_layers = n_layers
|
||||||
|
self.n_heads = n_heads
|
||||||
|
self.dim = dim
|
||||||
|
self.hidden_dim = hidden_dim
|
||||||
|
self.dropout = dropout
|
||||||
|
self.attention_dropout = attention_dropout
|
||||||
|
self.activation = activation
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.tie_weights_ = tie_weights_
|
||||||
|
self.qa_dropout = qa_dropout
|
||||||
|
self.seq_classif_dropout = seq_classif_dropout
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
|
||||||
self.n_layers = n_layers
|
|
||||||
self.n_heads = n_heads
|
|
||||||
self.dim = dim
|
|
||||||
self.hidden_dim = hidden_dim
|
|
||||||
self.dropout = dropout
|
|
||||||
self.attention_dropout = attention_dropout
|
|
||||||
self.activation = activation
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.tie_weights_ = tie_weights_
|
|
||||||
self.qa_dropout = qa_dropout
|
|
||||||
self.seq_classif_dropout = seq_classif_dropout
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
return self.dim
|
return self.dim
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
"""Configuration class to store the configuration of a `GPT2Model`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=50257,
|
vocab_size=50257,
|
||||||
n_positions=1024,
|
n_positions=1024,
|
||||||
n_ctx=1024,
|
n_ctx=1024,
|
||||||
n_embd=768,
|
n_embd=768,
|
||||||
@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
attn_pdrop=0.1,
|
attn_pdrop=0.1,
|
||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
summary_type='cls_index',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
"""Constructs GPT2Config.
|
"""Constructs GPT2Config.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig):
|
|||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
"""
|
"""
|
||||||
super(GPT2Config, self).__init__(**kwargs)
|
super(GPT2Config, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.n_ctx = n_ctx
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.n_positions = n_positions
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
self.n_embd = n_embd
|
||||||
json_config = json.loads(reader.read())
|
self.n_layer = n_layer
|
||||||
for key, value in json_config.items():
|
self.n_head = n_head
|
||||||
self.__dict__[key] = value
|
self.resid_pdrop = resid_pdrop
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
self.embd_pdrop = embd_pdrop
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.attn_pdrop = attn_pdrop
|
||||||
self.n_ctx = n_ctx
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
self.n_positions = n_positions
|
self.initializer_range = initializer_range
|
||||||
self.n_embd = n_embd
|
self.summary_type = summary_type
|
||||||
self.n_layer = n_layer
|
self.summary_use_proj = summary_use_proj
|
||||||
self.n_head = n_head
|
self.summary_activation = summary_activation
|
||||||
self.resid_pdrop = resid_pdrop
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.embd_pdrop = embd_pdrop
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
self.attn_pdrop = attn_pdrop
|
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
Configuration class to store the configuration of a `OpenAIGPTModel`.
|
Configuration class to store the configuration of a `OpenAIGPTModel`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel`.
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=40478,
|
vocab_size=40478,
|
||||||
n_positions=512,
|
n_positions=512,
|
||||||
n_ctx=512,
|
n_ctx=512,
|
||||||
n_embd=768,
|
n_embd=768,
|
||||||
@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
predict_special_tokens=True,
|
predict_special_tokens=True,
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
summary_type='cls_index',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
"""Constructs OpenAIGPTConfig.
|
"""Constructs OpenAIGPTConfig.
|
||||||
"""
|
"""
|
||||||
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.n_ctx = n_ctx
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.n_positions = n_positions
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
self.n_embd = n_embd
|
||||||
json_config = json.loads(reader.read())
|
self.n_layer = n_layer
|
||||||
for key, value in json_config.items():
|
self.n_head = n_head
|
||||||
self.__dict__[key] = value
|
self.afn = afn
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
self.resid_pdrop = resid_pdrop
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.embd_pdrop = embd_pdrop
|
||||||
self.n_ctx = n_ctx
|
self.attn_pdrop = attn_pdrop
|
||||||
self.n_positions = n_positions
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
self.n_embd = n_embd
|
self.initializer_range = initializer_range
|
||||||
self.n_layer = n_layer
|
self.predict_special_tokens = predict_special_tokens
|
||||||
self.n_head = n_head
|
self.summary_type = summary_type
|
||||||
self.afn = afn
|
self.summary_use_proj = summary_use_proj
|
||||||
self.resid_pdrop = resid_pdrop
|
self.summary_activation = summary_activation
|
||||||
self.embd_pdrop = embd_pdrop
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.attn_pdrop = attn_pdrop
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.predict_special_tokens = predict_special_tokens
|
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
|
|||||||
108
transformers/configuration_t5.py
Normal file
108
transformers/configuration_t5.py
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2010, The T5 Authors and HuggingFace Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" T5 model configuration """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import six
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
|
't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
|
||||||
|
't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
|
||||||
|
't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
|
||||||
|
't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
|
||||||
|
't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class T5Config(PretrainedConfig):
|
||||||
|
r"""
|
||||||
|
:class:`~transformers.T5Config` is the configuration class to store the configuration of a
|
||||||
|
`T5Model`.
|
||||||
|
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
vocab_size: Vocabulary size of `inputs_ids` in `T5Model`.
|
||||||
|
hidden_size: Size of the encoder layers and the pooler layer.
|
||||||
|
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||||
|
num_attention_heads: Number of attention heads for each attention layer in
|
||||||
|
the Transformer encoder.
|
||||||
|
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
||||||
|
layer in the Transformer encoder.
|
||||||
|
hidden_act: The non-linear activation function (function or string) in the
|
||||||
|
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
||||||
|
hidden_dropout_prob: The dropout probability for all fully connected
|
||||||
|
layers in the embeddings, encoder, and pooler.
|
||||||
|
attention_probs_dropout_prob: The dropout ratio for the attention
|
||||||
|
probabilities.
|
||||||
|
max_position_embeddings: The maximum sequence length that this model might
|
||||||
|
ever be used with. Typically set this to something large just in case
|
||||||
|
(e.g., 512 or 1024 or 2048).
|
||||||
|
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
||||||
|
`T5Model`.
|
||||||
|
initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
|
||||||
|
layer_norm_eps: The epsilon used by LayerNorm.
|
||||||
|
"""
|
||||||
|
pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
vocab_size=32128,
|
||||||
|
n_positions=512,
|
||||||
|
d_model=512,
|
||||||
|
d_kv=64,
|
||||||
|
d_ff=2048,
|
||||||
|
num_layers=6,
|
||||||
|
num_heads=8,
|
||||||
|
relative_attention_num_buckets=32,
|
||||||
|
dropout_rate=0.1,
|
||||||
|
layer_norm_epsilon=1e-6,
|
||||||
|
initializer_factor=1.0,
|
||||||
|
**kwargs):
|
||||||
|
super(T5Config, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.n_positions = n_positions
|
||||||
|
self.d_model = d_model
|
||||||
|
self.d_kv = d_kv
|
||||||
|
self.d_ff = d_ff
|
||||||
|
self.num_layers = num_layers
|
||||||
|
self.num_heads = num_heads
|
||||||
|
self.relative_attention_num_buckets = relative_attention_num_buckets
|
||||||
|
self.dropout_rate = dropout_rate
|
||||||
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
|
self.initializer_factor = initializer_factor
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_position_embeddings(self):
|
||||||
|
return self.n_positions
|
||||||
|
|
||||||
|
@property
|
||||||
|
def hidden_size(self):
|
||||||
|
return self.d_model
|
||||||
|
|
||||||
|
@property
|
||||||
|
def num_attention_heads(self):
|
||||||
|
return self.num_heads
|
||||||
|
|
||||||
|
@property
|
||||||
|
def num_hidden_layers(self):
|
||||||
|
return self.num_layers
|
||||||
@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel`.
|
||||||
cutoffs: cutoffs for the adaptive softmax
|
cutoffs: cutoffs for the adaptive softmax
|
||||||
d_model: Dimensionality of the model's hidden states.
|
d_model: Dimensionality of the model's hidden states.
|
||||||
d_embed: Dimensionality of the embeddings
|
d_embed: Dimensionality of the embeddings
|
||||||
@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=267735,
|
vocab_size=267735,
|
||||||
cutoffs=[20000, 40000, 200000],
|
cutoffs=[20000, 40000, 200000],
|
||||||
d_model=1024,
|
d_model=1024,
|
||||||
d_embed=1024,
|
d_embed=1024,
|
||||||
@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
"""Constructs TransfoXLConfig.
|
"""Constructs TransfoXLConfig.
|
||||||
"""
|
"""
|
||||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
super(TransfoXLConfig, self).__init__(**kwargs)
|
||||||
self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
|
self.vocab_size = vocab_size
|
||||||
self.cutoffs = []
|
self.cutoffs = []
|
||||||
self.cutoffs.extend(cutoffs)
|
self.cutoffs.extend(cutoffs)
|
||||||
self.tie_weight = tie_weight
|
self.tie_weight = tie_weight
|
||||||
@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
self.init_std = init_std
|
self.init_std = init_std
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif not isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
return self.tgt_len + self.ext_len + self.mem_len
|
return self.tgt_len + self.ext_len + self.mem_len
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def n_token(self): # Backward compatibility
|
||||||
return self.n_token
|
return self.vocab_size
|
||||||
|
|
||||||
@vocab_size.setter
|
@n_token.setter
|
||||||
def vocab_size(self, value):
|
def n_token(self, value): # Backward compatibility
|
||||||
self.n_token = value
|
self.vocab_size = value
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
from .file_utils import cached_path, CONFIG_NAME
|
from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -49,8 +49,7 @@ class PretrainedConfig(object):
|
|||||||
pretrained_config_archive_map = {}
|
pretrained_config_archive_map = {}
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
self.finetuning_task = kwargs.pop('finetuning_task', None)
|
# Attributes with defaults
|
||||||
self.num_labels = kwargs.pop('num_labels', 2)
|
|
||||||
self.output_attentions = kwargs.pop('output_attentions', False)
|
self.output_attentions = kwargs.pop('output_attentions', False)
|
||||||
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
||||||
self.output_past = kwargs.pop('output_past', True) # Not used by all models
|
self.output_past = kwargs.pop('output_past', True) # Not used by all models
|
||||||
@@ -61,6 +60,22 @@ class PretrainedConfig(object):
|
|||||||
self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
|
self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
|
||||||
self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
|
self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
|
||||||
|
|
||||||
|
# Fine-tuning task arguments
|
||||||
|
self.finetuning_task = kwargs.pop('finetuning_task', None)
|
||||||
|
self.num_labels = kwargs.pop('num_labels', 2)
|
||||||
|
self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
|
||||||
|
self.id2label = dict((int(key), value) for key, value in self.id2label.items())
|
||||||
|
self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
|
||||||
|
self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
|
||||||
|
|
||||||
|
# Additional attributes without default values
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
try:
|
||||||
|
setattr(self, key, value)
|
||||||
|
except AttributeError as err:
|
||||||
|
logger.error("Can't set {} with value {} for {}".format(key, value, self))
|
||||||
|
raise err
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
def save_pretrained(self, save_directory):
|
||||||
""" Save a configuration object to the directory `save_directory`, so that it
|
""" Save a configuration object to the directory `save_directory`, so that it
|
||||||
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
|
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
|
||||||
@@ -81,6 +96,7 @@ class PretrainedConfig(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||||
|
|
||||||
@@ -133,12 +149,18 @@ class PretrainedConfig(object):
|
|||||||
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
|
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
|
||||||
elif os.path.isdir(pretrained_model_name_or_path):
|
elif os.path.isdir(pretrained_model_name_or_path):
|
||||||
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
|
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
|
||||||
else:
|
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
|
||||||
config_file = pretrained_model_name_or_path
|
config_file = pretrained_model_name_or_path
|
||||||
# redirect to the cache, if necessary
|
else:
|
||||||
|
config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Load from URL or cache if already cached
|
||||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
|
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
|
||||||
proxies=proxies, resume_download=resume_download)
|
proxies=proxies, resume_download=resume_download)
|
||||||
|
# Load config
|
||||||
|
config = cls.from_json_file(resolved_config_file)
|
||||||
|
|
||||||
except EnvironmentError:
|
except EnvironmentError:
|
||||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
||||||
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
||||||
@@ -152,15 +174,18 @@ class PretrainedConfig(object):
|
|||||||
config_file, CONFIG_NAME)
|
config_file, CONFIG_NAME)
|
||||||
raise EnvironmentError(msg)
|
raise EnvironmentError(msg)
|
||||||
|
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
msg = "Couldn't reach server at '{}' to download configuration file or " \
|
||||||
|
"configuration file is not a valid JSON file. " \
|
||||||
|
"Please check network or file content here: {}.".format(config_file, resolved_config_file)
|
||||||
|
raise EnvironmentError(msg)
|
||||||
|
|
||||||
if resolved_config_file == config_file:
|
if resolved_config_file == config_file:
|
||||||
logger.info("loading configuration file {}".format(config_file))
|
logger.info("loading configuration file {}".format(config_file))
|
||||||
else:
|
else:
|
||||||
logger.info("loading configuration file {} from cache at {}".format(
|
logger.info("loading configuration file {} from cache at {}".format(
|
||||||
config_file, resolved_config_file))
|
config_file, resolved_config_file))
|
||||||
|
|
||||||
# Load config
|
|
||||||
config = cls.from_json_file(resolved_config_file)
|
|
||||||
|
|
||||||
if hasattr(config, 'pruned_heads'):
|
if hasattr(config, 'pruned_heads'):
|
||||||
config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
|
config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
|
||||||
|
|
||||||
@@ -182,17 +207,15 @@ class PretrainedConfig(object):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, json_object):
|
def from_dict(cls, json_object):
|
||||||
"""Constructs a `Config` from a Python dictionary of parameters."""
|
"""Constructs a `Config` from a Python dictionary of parameters."""
|
||||||
config = cls(vocab_size_or_config_json_file=-1)
|
return cls(**json_object)
|
||||||
for key, value in json_object.items():
|
|
||||||
setattr(config, key, value)
|
|
||||||
return config
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_json_file(cls, json_file):
|
def from_json_file(cls, json_file):
|
||||||
"""Constructs a `BertConfig` from a json file of parameters."""
|
"""Constructs a `Config` from a json file of parameters."""
|
||||||
with open(json_file, "r", encoding='utf-8') as reader:
|
with open(json_file, "r", encoding='utf-8') as reader:
|
||||||
text = reader.read()
|
text = reader.read()
|
||||||
return cls.from_dict(json.loads(text))
|
dict_obj = json.loads(text)
|
||||||
|
return cls(**dict_obj)
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return self.__dict__ == other.__dict__
|
return self.__dict__ == other.__dict__
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a `XLMModel`.
|
"""Configuration class to store the configuration of a `XLMModel`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
|
vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
|
||||||
d_model: Size of the encoder layers and the pooler layer.
|
d_model: Size of the encoder layers and the pooler layer.
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
n_layer: Number of hidden layers in the Transformer encoder.
|
||||||
n_head: Number of attention heads for each attention layer in
|
n_head: Number of attention heads for each attention layer in
|
||||||
@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30145,
|
vocab_size=30145,
|
||||||
emb_dim=2048,
|
emb_dim=2048,
|
||||||
n_layers=12,
|
n_layers=12,
|
||||||
n_heads=16,
|
n_heads=16,
|
||||||
@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
|
|||||||
unk_index=3,
|
unk_index=3,
|
||||||
mask_index=5,
|
mask_index=5,
|
||||||
is_encoder=True,
|
is_encoder=True,
|
||||||
|
|
||||||
finetuning_task=None,
|
|
||||||
num_labels=2,
|
|
||||||
summary_type='first',
|
summary_type='first',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
@@ -117,56 +114,46 @@ class XLMConfig(PretrainedConfig):
|
|||||||
"""Constructs XLMConfig.
|
"""Constructs XLMConfig.
|
||||||
"""
|
"""
|
||||||
super(XLMConfig, self).__init__(**kwargs)
|
super(XLMConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.emb_dim = emb_dim
|
||||||
|
self.n_layers = n_layers
|
||||||
|
self.n_heads = n_heads
|
||||||
|
self.dropout = dropout
|
||||||
|
self.attention_dropout = attention_dropout
|
||||||
|
self.gelu_activation = gelu_activation
|
||||||
|
self.sinusoidal_embeddings = sinusoidal_embeddings
|
||||||
|
self.causal = causal
|
||||||
|
self.asm = asm
|
||||||
|
self.n_langs = n_langs
|
||||||
|
self.use_lang_emb = use_lang_emb
|
||||||
|
self.layer_norm_eps = layer_norm_eps
|
||||||
|
self.bos_index = bos_index
|
||||||
|
self.eos_index = eos_index
|
||||||
|
self.pad_index = pad_index
|
||||||
|
self.unk_index = unk_index
|
||||||
|
self.mask_index = mask_index
|
||||||
|
self.is_encoder = is_encoder
|
||||||
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
self.embed_init_std = embed_init_std
|
||||||
|
self.init_std = init_std
|
||||||
|
self.summary_type = summary_type
|
||||||
|
self.summary_use_proj = summary_use_proj
|
||||||
|
self.summary_activation = summary_activation
|
||||||
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
|
self.summary_first_dropout = summary_first_dropout
|
||||||
|
self.start_n_top = start_n_top
|
||||||
|
self.end_n_top = end_n_top
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
if "n_words" in kwargs:
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.n_words = kwargs["n_words"]
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.n_words = vocab_size_or_config_json_file
|
|
||||||
self.emb_dim = emb_dim
|
|
||||||
self.n_layers = n_layers
|
|
||||||
self.n_heads = n_heads
|
|
||||||
self.dropout = dropout
|
|
||||||
self.attention_dropout = attention_dropout
|
|
||||||
self.gelu_activation = gelu_activation
|
|
||||||
self.sinusoidal_embeddings = sinusoidal_embeddings
|
|
||||||
self.causal = causal
|
|
||||||
self.asm = asm
|
|
||||||
self.n_langs = n_langs
|
|
||||||
self.use_lang_emb = use_lang_emb
|
|
||||||
self.layer_norm_eps = layer_norm_eps
|
|
||||||
self.bos_index = bos_index
|
|
||||||
self.eos_index = eos_index
|
|
||||||
self.pad_index = pad_index
|
|
||||||
self.unk_index = unk_index
|
|
||||||
self.mask_index = mask_index
|
|
||||||
self.is_encoder = is_encoder
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.embed_init_std = embed_init_std
|
|
||||||
self.init_std = init_std
|
|
||||||
self.finetuning_task = finetuning_task
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.start_n_top = start_n_top
|
|
||||||
self.end_n_top = end_n_top
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def n_words(self): # For backward compatibility
|
||||||
return self.n_words
|
return self.vocab_size
|
||||||
|
|
||||||
@vocab_size.setter
|
@n_words.setter
|
||||||
def vocab_size(self, value):
|
def n_words(self, value): # For backward compatibility
|
||||||
self.n_words = value
|
self.vocab_size = value
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
"""Configuration class to store the configuration of a ``XLNetModel``.
|
"""Configuration class to store the configuration of a ``XLNetModel``.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
|
vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
|
||||||
d_model: Size of the encoder layers and the pooler layer.
|
d_model: Size of the encoder layers and the pooler layer.
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
n_layer: Number of hidden layers in the Transformer encoder.
|
||||||
n_head: Number of attention heads for each attention layer in
|
n_head: Number of attention heads for each attention layer in
|
||||||
@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=32000,
|
vocab_size=32000,
|
||||||
d_model=1024,
|
d_model=1024,
|
||||||
n_layer=24,
|
n_layer=24,
|
||||||
n_head=16,
|
n_head=16,
|
||||||
d_inner=4096,
|
d_inner=4096,
|
||||||
max_position_embeddings=512,
|
|
||||||
ff_activation="gelu",
|
ff_activation="gelu",
|
||||||
untie_r=True,
|
untie_r=True,
|
||||||
attn_type="bi",
|
attn_type="bi",
|
||||||
|
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
|
|
||||||
dropout=0.1,
|
dropout=0.1,
|
||||||
mem_len=None,
|
mem_len=None,
|
||||||
reuse_len=None,
|
reuse_len=None,
|
||||||
bi_data=False,
|
bi_data=False,
|
||||||
clamp_len=-1,
|
clamp_len=-1,
|
||||||
same_length=False,
|
same_length=False,
|
||||||
|
|
||||||
finetuning_task=None,
|
|
||||||
num_labels=2,
|
|
||||||
summary_type='last',
|
summary_type='last',
|
||||||
summary_use_proj=True,
|
summary_use_proj=True,
|
||||||
summary_activation='tanh',
|
summary_activation='tanh',
|
||||||
@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
"""Constructs XLNetConfig.
|
"""Constructs XLNetConfig.
|
||||||
"""
|
"""
|
||||||
super(XLNetConfig, self).__init__(**kwargs)
|
super(XLNetConfig, self).__init__(**kwargs)
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.d_model = d_model
|
||||||
|
self.n_layer = n_layer
|
||||||
|
self.n_head = n_head
|
||||||
|
assert d_model % n_head == 0
|
||||||
|
self.d_head = d_model // n_head
|
||||||
|
self.ff_activation = ff_activation
|
||||||
|
self.d_inner = d_inner
|
||||||
|
self.untie_r = untie_r
|
||||||
|
self.attn_type = attn_type
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
self.initializer_range = initializer_range
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
self.layer_norm_eps = layer_norm_eps
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
setattr(config, key, value)
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.n_token = vocab_size_or_config_json_file
|
|
||||||
self.d_model = d_model
|
|
||||||
self.n_layer = n_layer
|
|
||||||
self.n_head = n_head
|
|
||||||
assert d_model % n_head == 0
|
|
||||||
self.d_head = d_model // n_head
|
|
||||||
self.ff_activation = ff_activation
|
|
||||||
self.d_inner = d_inner
|
|
||||||
self.untie_r = untie_r
|
|
||||||
self.attn_type = attn_type
|
|
||||||
|
|
||||||
self.initializer_range = initializer_range
|
self.dropout = dropout
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.mem_len = mem_len
|
||||||
|
self.reuse_len = reuse_len
|
||||||
|
self.bi_data = bi_data
|
||||||
|
self.clamp_len = clamp_len
|
||||||
|
self.same_length = same_length
|
||||||
|
|
||||||
self.dropout = dropout
|
self.summary_type = summary_type
|
||||||
self.mem_len = mem_len
|
self.summary_use_proj = summary_use_proj
|
||||||
self.reuse_len = reuse_len
|
self.summary_activation = summary_activation
|
||||||
self.bi_data = bi_data
|
self.summary_last_dropout = summary_last_dropout
|
||||||
self.clamp_len = clamp_len
|
self.start_n_top = start_n_top
|
||||||
self.same_length = same_length
|
self.end_n_top = end_n_top
|
||||||
|
|
||||||
self.finetuning_task = finetuning_task
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_last_dropout = summary_last_dropout
|
|
||||||
self.start_n_top = start_n_top
|
|
||||||
self.end_n_top = end_n_top
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_position_embeddings(self):
|
def max_position_embeddings(self):
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def n_token(self): # Backward compatibility
|
||||||
return self.n_token
|
return self.vocab_size
|
||||||
|
|
||||||
@vocab_size.setter
|
@n_token.setter
|
||||||
def vocab_size(self, value):
|
def n_token(self, value): # Backward compatibility
|
||||||
self.n_token = value
|
self.vocab_size = value
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
|
|||||||
@@ -34,7 +34,8 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
|
|||||||
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
@@ -48,7 +49,8 @@ if is_torch_available():
|
|||||||
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
else:
|
||||||
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
@@ -59,7 +61,8 @@ else:
|
|||||||
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) = (
|
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
|
||||||
None, None, None, None,
|
None, None, None, None,
|
||||||
None, None,
|
None, None,
|
||||||
None, None,
|
None, None,
|
||||||
@@ -69,6 +72,7 @@ else:
|
|||||||
None, None, None,
|
None, None, None,
|
||||||
None, None, None,
|
None, None, None,
|
||||||
None, None,
|
None, None,
|
||||||
|
None, None,
|
||||||
None, None)
|
None, None)
|
||||||
|
|
||||||
|
|
||||||
@@ -90,7 +94,8 @@ MODEL_CLASSES = {
|
|||||||
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
}
|
}
|
||||||
|
|
||||||
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
|
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
|
||||||
@@ -115,23 +120,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
|
|||||||
tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
|
tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
|
||||||
|
|
||||||
if compare_with_pt_model:
|
if compare_with_pt_model:
|
||||||
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
|
tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network
|
||||||
tf_inputs = tf.constant(inputs_list)
|
|
||||||
tfo = tf_model(tf_inputs, training=False) # build the network
|
|
||||||
|
|
||||||
pt_model = pt_model_class.from_pretrained(None,
|
state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
|
||||||
|
pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
|
||||||
config=config,
|
config=config,
|
||||||
state_dict=torch.load(pytorch_checkpoint_path,
|
state_dict=state_dict)
|
||||||
map_location='cpu'))
|
|
||||||
pt_inputs = torch.tensor(inputs_list)
|
|
||||||
with torch.no_grad():
|
|
||||||
pto = pt_model(pt_inputs)
|
|
||||||
|
|
||||||
np_pt = pto[0].detach().numpy()
|
with torch.no_grad():
|
||||||
|
pto = pt_model(**pt_model.dummy_inputs)
|
||||||
|
|
||||||
|
np_pt = pto[0].numpy()
|
||||||
np_tf = tfo[0].numpy()
|
np_tf = tfo[0].numpy()
|
||||||
diff = np.amax(np.abs(np_pt - np_tf))
|
diff = np.amax(np.abs(np_pt - np_tf))
|
||||||
print("Max absolute difference between models outputs {}".format(diff))
|
print("Max absolute difference between models outputs {}".format(diff))
|
||||||
assert diff <= 2e-2, "Error, model absolute difference is >2e-2"
|
assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff)
|
||||||
|
|
||||||
# Save pytorch-model
|
# Save pytorch-model
|
||||||
print("Save TensorFlow model to {}".format(tf_dump_path))
|
print("Save TensorFlow model to {}".format(tf_dump_path))
|
||||||
@@ -139,7 +142,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
|
|||||||
|
|
||||||
|
|
||||||
def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
|
def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
|
||||||
compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
|
compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
|
||||||
assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
|
assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
|
||||||
|
|
||||||
if args_model_type is None:
|
if args_model_type is None:
|
||||||
@@ -187,13 +190,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
|
|||||||
|
|
||||||
if os.path.isfile(model_shortcut_name):
|
if os.path.isfile(model_shortcut_name):
|
||||||
model_shortcut_name = 'converted_model'
|
model_shortcut_name = 'converted_model'
|
||||||
|
|
||||||
convert_pt_checkpoint_to_tf(model_type=model_type,
|
convert_pt_checkpoint_to_tf(model_type=model_type,
|
||||||
pytorch_checkpoint_path=model_file,
|
pytorch_checkpoint_path=model_file,
|
||||||
config_file=config_file,
|
config_file=config_file,
|
||||||
tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
|
tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
|
||||||
compare_with_pt_model=compare_with_pt_model)
|
compare_with_pt_model=compare_with_pt_model)
|
||||||
os.remove(config_file)
|
if remove_cached_files:
|
||||||
os.remove(model_file)
|
os.remove(config_file)
|
||||||
|
os.remove(model_file)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@@ -226,6 +231,9 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("--use_cached_models",
|
parser.add_argument("--use_cached_models",
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help = "Use cached models if possible instead of updating to latest checkpoint versions.")
|
help = "Use cached models if possible instead of updating to latest checkpoint versions.")
|
||||||
|
parser.add_argument("--remove_cached_files",
|
||||||
|
action='store_true',
|
||||||
|
help = "Remove pytorch models after conversion (save memory when converting in batches).")
|
||||||
parser.add_argument("--only_convert_finetuned_models",
|
parser.add_argument("--only_convert_finetuned_models",
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help = "Only convert finetuned models.")
|
help = "Only convert finetuned models.")
|
||||||
@@ -245,4 +253,5 @@ if __name__ == "__main__":
|
|||||||
config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
|
config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
|
||||||
compare_with_pt_model=args.compare_with_pt_model,
|
compare_with_pt_model=args.compare_with_pt_model,
|
||||||
use_cached_models=args.use_cached_models,
|
use_cached_models=args.use_cached_models,
|
||||||
|
remove_cached_files=args.remove_cached_files,
|
||||||
only_convert_finetuned_models=args.only_convert_finetuned_models)
|
only_convert_finetuned_models=args.only_convert_finetuned_models)
|
||||||
|
|||||||
@@ -20,6 +20,13 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
import fairseq
|
||||||
|
from packaging import version
|
||||||
|
|
||||||
|
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
|
||||||
|
raise Exception("requires fairseq >= 0.9.0")
|
||||||
|
|
||||||
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
||||||
from fairseq.modules import TransformerSentenceEncoderLayer
|
from fairseq.modules import TransformerSentenceEncoderLayer
|
||||||
@@ -45,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
"""
|
"""
|
||||||
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
|
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
|
||||||
roberta.eval() # disable dropout
|
roberta.eval() # disable dropout
|
||||||
|
roberta_sent_encoder = roberta.model.decoder.sentence_encoder
|
||||||
config = BertConfig(
|
config = BertConfig(
|
||||||
vocab_size_or_config_json_file=50265,
|
vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
|
||||||
hidden_size=roberta.args.encoder_embed_dim,
|
hidden_size=roberta.args.encoder_embed_dim,
|
||||||
num_hidden_layers=roberta.args.encoder_layers,
|
num_hidden_layers=roberta.args.encoder_layers,
|
||||||
num_attention_heads=roberta.args.encoder_attention_heads,
|
num_attention_heads=roberta.args.encoder_attention_heads,
|
||||||
@@ -64,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
|
|
||||||
# Now let's copy all the weights.
|
# Now let's copy all the weights.
|
||||||
# Embeddings
|
# Embeddings
|
||||||
roberta_sent_encoder = roberta.model.decoder.sentence_encoder
|
|
||||||
model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
|
model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
|
||||||
model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
|
model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
|
||||||
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them.
|
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them.
|
||||||
@@ -79,15 +86,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
### self attention
|
### self attention
|
||||||
self_attn: BertSelfAttention = layer.attention.self
|
self_attn: BertSelfAttention = layer.attention.self
|
||||||
assert(
|
assert(
|
||||||
roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size))
|
roberta_layer.self_attn.k_proj.weight.data.shape == \
|
||||||
|
roberta_layer.self_attn.q_proj.weight.data.shape == \
|
||||||
|
roberta_layer.self_attn.v_proj.weight.data.shape == \
|
||||||
|
torch.Size((config.hidden_size, config.hidden_size))
|
||||||
)
|
)
|
||||||
# we use three distinct linear layers so we split the source layer here.
|
|
||||||
self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :]
|
self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
|
||||||
self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size]
|
self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
|
||||||
self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :]
|
self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
|
||||||
self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size]
|
self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
|
||||||
self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :]
|
self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
|
||||||
self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:]
|
self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
|
||||||
|
|
||||||
### self-attention output
|
### self-attention output
|
||||||
self_output: BertSelfOutput = layer.attention.output
|
self_output: BertSelfOutput = layer.attention.output
|
||||||
@@ -151,6 +161,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
if not success:
|
if not success:
|
||||||
raise Exception("Something went wRoNg")
|
raise Exception("Something went wRoNg")
|
||||||
|
|
||||||
|
pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
|
||||||
print(f"Saving model to {pytorch_dump_folder_path}")
|
print(f"Saving model to {pytorch_dump_folder_path}")
|
||||||
model.save_pretrained(pytorch_dump_folder_path)
|
model.save_pretrained(pytorch_dump_folder_path)
|
||||||
|
|
||||||
|
|||||||
65
transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
Executable file
65
transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
Executable file
@@ -0,0 +1,65 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The T5 authors and HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Convert T5 checkpoint."""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from transformers import T5Config, T5Model, load_tf_weights_in_t5
|
||||||
|
|
||||||
|
import logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
|
||||||
|
# Initialise PyTorch model
|
||||||
|
config = T5Config.from_json_file(config_file)
|
||||||
|
print("Building PyTorch model from configuration: {}".format(str(config)))
|
||||||
|
model = T5Model(config)
|
||||||
|
|
||||||
|
# Load weights from tf checkpoint
|
||||||
|
load_tf_weights_in_t5(model, config, tf_checkpoint_path)
|
||||||
|
|
||||||
|
# Save pytorch-model
|
||||||
|
print("Save PyTorch model to {}".format(pytorch_dump_path))
|
||||||
|
torch.save(model.state_dict(), pytorch_dump_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
## Required parameters
|
||||||
|
parser.add_argument("--tf_checkpoint_path",
|
||||||
|
default = None,
|
||||||
|
type = str,
|
||||||
|
required = True,
|
||||||
|
help = "Path to the TensorFlow checkpoint path.")
|
||||||
|
parser.add_argument("--config_file",
|
||||||
|
default = None,
|
||||||
|
type = str,
|
||||||
|
required = True,
|
||||||
|
help = "The config json file corresponding to the pre-trained T5 model. \n"
|
||||||
|
"This specifies the model architecture.")
|
||||||
|
parser.add_argument("--pytorch_dump_path",
|
||||||
|
default = None,
|
||||||
|
type = str,
|
||||||
|
required = True,
|
||||||
|
help = "Path to the output PyTorch model.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
|
||||||
|
args.config_file,
|
||||||
|
args.pytorch_dump_path)
|
||||||
@@ -695,7 +695,12 @@ def compute_predictions_log_probs(
|
|||||||
tok_text = " ".join(tok_text.split())
|
tok_text = " ".join(tok_text.split())
|
||||||
orig_text = " ".join(orig_tokens)
|
orig_text = " ".join(orig_tokens)
|
||||||
|
|
||||||
final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
|
if hasattr(tokenizer, "do_lower_case"):
|
||||||
|
do_lower_case = tokenizer.do_lower_case
|
||||||
|
else:
|
||||||
|
do_lower_case = tokenizer.do_lowercase_and_remove_accent
|
||||||
|
|
||||||
|
final_text = get_final_text(tok_text, orig_text, do_lower_case,
|
||||||
verbose_logging)
|
verbose_logging)
|
||||||
|
|
||||||
if final_text in seen_predictions:
|
if final_text in seen_predictions:
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
|
|||||||
if is_tf_available() and is_tf_dataset:
|
if is_tf_available() and is_tf_dataset:
|
||||||
def gen():
|
def gen():
|
||||||
for ex in features:
|
for ex in features:
|
||||||
yield ({'input_ids': ex.input_ids,
|
yield ({'input_ids': ex.input_ids,
|
||||||
'attention_mask': ex.attention_mask,
|
'attention_mask': ex.attention_mask,
|
||||||
'token_type_ids': ex.token_type_ids},
|
'token_type_ids': ex.token_type_ids},
|
||||||
ex.label)
|
ex.label)
|
||||||
|
|||||||
@@ -18,19 +18,20 @@ if is_tf_available():
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
|
|
||||||
orig_answer_text):
|
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
|
||||||
"""Returns tokenized answer spans that better match the annotated answer."""
|
"""Returns tokenized answer spans that better match the annotated answer."""
|
||||||
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
|
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
|
||||||
|
|
||||||
for new_start in range(input_start, input_end + 1):
|
for new_start in range(input_start, input_end + 1):
|
||||||
for new_end in range(input_end, new_start - 1, -1):
|
for new_end in range(input_end, new_start - 1, -1):
|
||||||
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
|
text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
|
||||||
if text_span == tok_answer_text:
|
if text_span == tok_answer_text:
|
||||||
return (new_start, new_end)
|
return (new_start, new_end)
|
||||||
|
|
||||||
return (input_start, input_end)
|
return (input_start, input_end)
|
||||||
|
|
||||||
|
|
||||||
def _check_is_max_context(doc_spans, cur_span_index, position):
|
def _check_is_max_context(doc_spans, cur_span_index, position):
|
||||||
"""Check if this is the 'max context' doc span for the token."""
|
"""Check if this is the 'max context' doc span for the token."""
|
||||||
best_score = None
|
best_score = None
|
||||||
@@ -50,10 +51,11 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
|
|||||||
|
|
||||||
return cur_span_index == best_span_index
|
return cur_span_index == best_span_index
|
||||||
|
|
||||||
|
|
||||||
def _new_check_is_max_context(doc_spans, cur_span_index, position):
|
def _new_check_is_max_context(doc_spans, cur_span_index, position):
|
||||||
"""Check if this is the 'max context' doc span for the token."""
|
"""Check if this is the 'max context' doc span for the token."""
|
||||||
# if len(doc_spans) == 1:
|
# if len(doc_spans) == 1:
|
||||||
# return True
|
# return True
|
||||||
best_score = None
|
best_score = None
|
||||||
best_span_index = None
|
best_span_index = None
|
||||||
for (span_index, doc_span) in enumerate(doc_spans):
|
for (span_index, doc_span) in enumerate(doc_spans):
|
||||||
@@ -71,14 +73,16 @@ def _new_check_is_max_context(doc_spans, cur_span_index, position):
|
|||||||
|
|
||||||
return cur_span_index == best_span_index
|
return cur_span_index == best_span_index
|
||||||
|
|
||||||
|
|
||||||
def _is_whitespace(c):
|
def _is_whitespace(c):
|
||||||
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
|
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|
||||||
doc_stride, max_query_length, is_training,
|
def squad_convert_examples_to_features(
|
||||||
return_dataset=False):
|
examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Converts a list of examples into a list of features that can be directly given as input to a model.
|
Converts a list of examples into a list of features that can be directly given as input to a model.
|
||||||
It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
|
It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
|
||||||
@@ -112,24 +116,23 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Defining helper methods
|
# Defining helper methods
|
||||||
unique_id = 1000000000
|
unique_id = 1000000000
|
||||||
|
|
||||||
features = []
|
features = []
|
||||||
for (example_index, example) in enumerate(tqdm(examples)):
|
for (example_index, example) in enumerate(tqdm(examples, desc="Converting examples to features")):
|
||||||
if is_training and not example.is_impossible:
|
if is_training and not example.is_impossible:
|
||||||
# Get start and end position
|
# Get start and end position
|
||||||
start_position = example.start_position
|
start_position = example.start_position
|
||||||
end_position = example.end_position
|
end_position = example.end_position
|
||||||
|
|
||||||
# If the answer cannot be found in the text, then skip this example.
|
# If the answer cannot be found in the text, then skip this example.
|
||||||
actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
|
actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
|
||||||
cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
|
cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
|
||||||
if actual_text.find(cleaned_answer_text) == -1:
|
if actual_text.find(cleaned_answer_text) == -1:
|
||||||
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
|
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
tok_to_orig_index = []
|
tok_to_orig_index = []
|
||||||
orig_to_tok_index = []
|
orig_to_tok_index = []
|
||||||
all_doc_tokens = []
|
all_doc_tokens = []
|
||||||
@@ -140,7 +143,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
tok_to_orig_index.append(i)
|
tok_to_orig_index.append(i)
|
||||||
all_doc_tokens.append(sub_token)
|
all_doc_tokens.append(sub_token)
|
||||||
|
|
||||||
|
|
||||||
if is_training and not example.is_impossible:
|
if is_training and not example.is_impossible:
|
||||||
tok_start_position = orig_to_tok_index[example.start_position]
|
tok_start_position = orig_to_tok_index[example.start_position]
|
||||||
if example.end_position < len(example.doc_tokens) - 1:
|
if example.end_position < len(example.doc_tokens) - 1:
|
||||||
@@ -153,36 +155,41 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
)
|
)
|
||||||
|
|
||||||
spans = []
|
spans = []
|
||||||
|
|
||||||
truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
|
truncated_query = tokenizer.encode(
|
||||||
sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
|
example.question_text, add_special_tokens=False, max_length=max_query_length
|
||||||
sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
|
)
|
||||||
|
sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
|
||||||
|
sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
|
||||||
|
|
||||||
span_doc_tokens = all_doc_tokens
|
span_doc_tokens = all_doc_tokens
|
||||||
while len(spans) * doc_stride < len(all_doc_tokens):
|
while len(spans) * doc_stride < len(all_doc_tokens):
|
||||||
|
|
||||||
encoded_dict = tokenizer.encode_plus(
|
encoded_dict = tokenizer.encode_plus(
|
||||||
truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
|
truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
|
||||||
span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
|
span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
|
||||||
max_length=max_seq_length,
|
max_length=max_seq_length,
|
||||||
return_overflowing_tokens=True,
|
return_overflowing_tokens=True,
|
||||||
pad_to_max_length=True,
|
pad_to_max_length=True,
|
||||||
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
|
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
|
||||||
truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
|
truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
|
||||||
)
|
)
|
||||||
|
|
||||||
paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
|
paragraph_len = min(
|
||||||
|
len(all_doc_tokens) - len(spans) * doc_stride,
|
||||||
|
max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
if tokenizer.pad_token_id in encoded_dict['input_ids']:
|
if tokenizer.pad_token_id in encoded_dict["input_ids"]:
|
||||||
non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
|
non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
|
||||||
else:
|
else:
|
||||||
non_padded_ids = encoded_dict['input_ids']
|
non_padded_ids = encoded_dict["input_ids"]
|
||||||
|
|
||||||
tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
|
tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
|
||||||
|
|
||||||
token_to_orig_map = {}
|
token_to_orig_map = {}
|
||||||
for i in range(paragraph_len):
|
for i in range(paragraph_len):
|
||||||
index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
|
index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
|
||||||
token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
|
token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
|
||||||
|
|
||||||
encoded_dict["paragraph_len"] = paragraph_len
|
encoded_dict["paragraph_len"] = paragraph_len
|
||||||
@@ -202,16 +209,20 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
for doc_span_index in range(len(spans)):
|
for doc_span_index in range(len(spans)):
|
||||||
for j in range(spans[doc_span_index]["paragraph_len"]):
|
for j in range(spans[doc_span_index]["paragraph_len"]):
|
||||||
is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
|
is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
|
||||||
index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
|
index = (
|
||||||
|
j
|
||||||
|
if tokenizer.padding_side == "left"
|
||||||
|
else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
|
||||||
|
)
|
||||||
spans[doc_span_index]["token_is_max_context"][index] = is_max_context
|
spans[doc_span_index]["token_is_max_context"][index] = is_max_context
|
||||||
|
|
||||||
for span in spans:
|
for span in spans:
|
||||||
# Identify the position of the CLS token
|
# Identify the position of the CLS token
|
||||||
cls_index = span['input_ids'].index(tokenizer.cls_token_id)
|
cls_index = span["input_ids"].index(tokenizer.cls_token_id)
|
||||||
|
|
||||||
# p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
|
# p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
|
||||||
# Original TF implem also keep the classification token (set to 0) (not sure why...)
|
# Original TF implem also keep the classification token (set to 0) (not sure why...)
|
||||||
p_mask = np.array(span['token_type_ids'])
|
p_mask = np.array(span["token_type_ids"])
|
||||||
|
|
||||||
p_mask = np.minimum(p_mask, 1)
|
p_mask = np.minimum(p_mask, 1)
|
||||||
|
|
||||||
@@ -224,7 +235,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
# Set the CLS index to '0'
|
# Set the CLS index to '0'
|
||||||
p_mask[cls_index] = 0
|
p_mask[cls_index] = 0
|
||||||
|
|
||||||
|
|
||||||
span_is_impossible = example.is_impossible
|
span_is_impossible = example.is_impossible
|
||||||
start_position = 0
|
start_position = 0
|
||||||
end_position = 0
|
end_position = 0
|
||||||
@@ -247,55 +257,99 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
doc_offset = 0
|
doc_offset = 0
|
||||||
else:
|
else:
|
||||||
doc_offset = len(truncated_query) + sequence_added_tokens
|
doc_offset = len(truncated_query) + sequence_added_tokens
|
||||||
|
|
||||||
start_position = tok_start_position - doc_start + doc_offset
|
start_position = tok_start_position - doc_start + doc_offset
|
||||||
end_position = tok_end_position - doc_start + doc_offset
|
end_position = tok_end_position - doc_start + doc_offset
|
||||||
|
|
||||||
|
features.append(
|
||||||
features.append(SquadFeatures(
|
SquadFeatures(
|
||||||
span['input_ids'],
|
span["input_ids"],
|
||||||
span['attention_mask'],
|
span["attention_mask"],
|
||||||
span['token_type_ids'],
|
span["token_type_ids"],
|
||||||
cls_index,
|
cls_index,
|
||||||
p_mask.tolist(),
|
p_mask.tolist(),
|
||||||
|
example_index=example_index,
|
||||||
example_index=example_index,
|
unique_id=unique_id,
|
||||||
unique_id=unique_id,
|
paragraph_len=span["paragraph_len"],
|
||||||
paragraph_len=span['paragraph_len'],
|
token_is_max_context=span["token_is_max_context"],
|
||||||
token_is_max_context=span["token_is_max_context"],
|
tokens=span["tokens"],
|
||||||
tokens=span["tokens"],
|
token_to_orig_map=span["token_to_orig_map"],
|
||||||
token_to_orig_map=span["token_to_orig_map"],
|
start_position=start_position,
|
||||||
|
end_position=end_position,
|
||||||
start_position=start_position,
|
)
|
||||||
end_position=end_position
|
)
|
||||||
))
|
|
||||||
|
|
||||||
unique_id += 1
|
unique_id += 1
|
||||||
|
|
||||||
if return_dataset == 'pt':
|
if return_dataset == "pt":
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
raise ImportError("Pytorch must be installed to return a pytorch dataset.")
|
raise ImportError("Pytorch must be installed to return a pytorch dataset.")
|
||||||
|
|
||||||
# Convert to Tensors and build dataset
|
# Convert to Tensors and build dataset
|
||||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||||
all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
|
all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
|
||||||
all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
|
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
|
||||||
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
|
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
|
||||||
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
|
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
|
||||||
|
|
||||||
if not is_training:
|
if not is_training:
|
||||||
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
|
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
|
||||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
dataset = TensorDataset(
|
||||||
all_example_index, all_cls_index, all_p_mask)
|
all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
|
all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
|
||||||
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
|
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
|
||||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
dataset = TensorDataset(
|
||||||
all_start_positions, all_end_positions,
|
all_input_ids,
|
||||||
all_cls_index, all_p_mask)
|
all_attention_masks,
|
||||||
|
all_token_type_ids,
|
||||||
|
all_start_positions,
|
||||||
|
all_end_positions,
|
||||||
|
all_cls_index,
|
||||||
|
all_p_mask,
|
||||||
|
)
|
||||||
|
|
||||||
return features, dataset
|
return features, dataset
|
||||||
|
elif return_dataset == "tf":
|
||||||
|
if not is_tf_available():
|
||||||
|
raise ImportError("TensorFlow must be installed to return a TensorFlow dataset.")
|
||||||
|
|
||||||
|
def gen():
|
||||||
|
for ex in features:
|
||||||
|
yield (
|
||||||
|
{
|
||||||
|
"input_ids": ex.input_ids,
|
||||||
|
"attention_mask": ex.attention_mask,
|
||||||
|
"token_type_ids": ex.token_type_ids,
|
||||||
|
}, {
|
||||||
|
"start_position": ex.start_position,
|
||||||
|
"end_position": ex.end_position,
|
||||||
|
"cls_index": ex.cls_index,
|
||||||
|
"p_mask": ex.p_mask,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return tf.data.Dataset.from_generator(
|
||||||
|
gen,
|
||||||
|
(
|
||||||
|
{"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
|
||||||
|
{"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32},
|
||||||
|
),
|
||||||
|
(
|
||||||
|
{
|
||||||
|
"input_ids": tf.TensorShape([None]),
|
||||||
|
"attention_mask": tf.TensorShape([None]),
|
||||||
|
"token_type_ids": tf.TensorShape([None]),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"start_position": tf.TensorShape([]),
|
||||||
|
"end_position": tf.TensorShape([]),
|
||||||
|
"cls_index": tf.TensorShape([]),
|
||||||
|
"p_mask": tf.TensorShape([None]),
|
||||||
|
},
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
return features
|
return features
|
||||||
|
|
||||||
@@ -305,31 +359,32 @@ class SquadProcessor(DataProcessor):
|
|||||||
Processor for the SQuAD data set.
|
Processor for the SQuAD data set.
|
||||||
Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
|
Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
train_file = None
|
train_file = None
|
||||||
dev_file = None
|
dev_file = None
|
||||||
|
|
||||||
def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
|
def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
|
||||||
if not evaluate:
|
if not evaluate:
|
||||||
answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
|
answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
|
||||||
answer_start = tensor_dict['answers']['answer_start'][0].numpy()
|
answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
|
||||||
answers = []
|
answers = []
|
||||||
else:
|
else:
|
||||||
answers = [{
|
answers = [
|
||||||
"answer_start": start.numpy(),
|
{"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
|
||||||
"text": text.numpy().decode('utf-8')
|
for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
|
||||||
} for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]
|
]
|
||||||
|
|
||||||
answer = None
|
answer = None
|
||||||
answer_start = None
|
answer_start = None
|
||||||
|
|
||||||
return SquadExample(
|
return SquadExample(
|
||||||
qas_id=tensor_dict['id'].numpy().decode("utf-8"),
|
qas_id=tensor_dict["id"].numpy().decode("utf-8"),
|
||||||
question_text=tensor_dict['question'].numpy().decode('utf-8'),
|
question_text=tensor_dict["question"].numpy().decode("utf-8"),
|
||||||
context_text=tensor_dict['context'].numpy().decode('utf-8'),
|
context_text=tensor_dict["context"].numpy().decode("utf-8"),
|
||||||
answer_text=answer,
|
answer_text=answer,
|
||||||
start_position_character=answer_start,
|
start_position_character=answer_start,
|
||||||
title=tensor_dict['title'].numpy().decode('utf-8'),
|
title=tensor_dict["title"].numpy().decode("utf-8"),
|
||||||
answers=answers
|
answers=answers,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_examples_from_dataset(self, dataset, evaluate=False):
|
def get_examples_from_dataset(self, dataset, evaluate=False):
|
||||||
@@ -359,7 +414,7 @@ class SquadProcessor(DataProcessor):
|
|||||||
|
|
||||||
examples = []
|
examples = []
|
||||||
for tensor_dict in tqdm(dataset):
|
for tensor_dict in tqdm(dataset):
|
||||||
examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
|
examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
|
||||||
|
|
||||||
return examples
|
return examples
|
||||||
|
|
||||||
@@ -373,10 +428,15 @@ class SquadProcessor(DataProcessor):
|
|||||||
which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
|
which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
if data_dir is None:
|
||||||
|
data_dir = ""
|
||||||
|
|
||||||
if self.train_file is None:
|
if self.train_file is None:
|
||||||
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
|
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
|
||||||
|
|
||||||
with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader:
|
with open(
|
||||||
|
os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
|
||||||
|
) as reader:
|
||||||
input_data = json.load(reader)["data"]
|
input_data = json.load(reader)["data"]
|
||||||
return self._create_examples(input_data, "train")
|
return self._create_examples(input_data, "train")
|
||||||
|
|
||||||
@@ -389,10 +449,15 @@ class SquadProcessor(DataProcessor):
|
|||||||
filename: None by default, specify this if the evaluation file has a different name than the original one
|
filename: None by default, specify this if the evaluation file has a different name than the original one
|
||||||
which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
|
which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
|
||||||
"""
|
"""
|
||||||
|
if data_dir is None:
|
||||||
|
data_dir = ""
|
||||||
|
|
||||||
if self.dev_file is None:
|
if self.dev_file is None:
|
||||||
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
|
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
|
||||||
|
|
||||||
with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
|
with open(
|
||||||
|
os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
|
||||||
|
) as reader:
|
||||||
input_data = json.load(reader)["data"]
|
input_data = json.load(reader)["data"]
|
||||||
return self._create_examples(input_data, "dev")
|
return self._create_examples(input_data, "dev")
|
||||||
|
|
||||||
@@ -400,7 +465,7 @@ class SquadProcessor(DataProcessor):
|
|||||||
is_training = set_type == "train"
|
is_training = set_type == "train"
|
||||||
examples = []
|
examples = []
|
||||||
for entry in tqdm(input_data):
|
for entry in tqdm(input_data):
|
||||||
title = entry['title']
|
title = entry["title"]
|
||||||
for paragraph in entry["paragraphs"]:
|
for paragraph in entry["paragraphs"]:
|
||||||
context_text = paragraph["context"]
|
context_text = paragraph["context"]
|
||||||
for qa in paragraph["qas"]:
|
for qa in paragraph["qas"]:
|
||||||
@@ -409,7 +474,7 @@ class SquadProcessor(DataProcessor):
|
|||||||
start_position_character = None
|
start_position_character = None
|
||||||
answer_text = None
|
answer_text = None
|
||||||
answers = []
|
answers = []
|
||||||
|
|
||||||
if "is_impossible" in qa:
|
if "is_impossible" in qa:
|
||||||
is_impossible = qa["is_impossible"]
|
is_impossible = qa["is_impossible"]
|
||||||
else:
|
else:
|
||||||
@@ -418,8 +483,8 @@ class SquadProcessor(DataProcessor):
|
|||||||
if not is_impossible:
|
if not is_impossible:
|
||||||
if is_training:
|
if is_training:
|
||||||
answer = qa["answers"][0]
|
answer = qa["answers"][0]
|
||||||
answer_text = answer['text']
|
answer_text = answer["text"]
|
||||||
start_position_character = answer['answer_start']
|
start_position_character = answer["answer_start"]
|
||||||
else:
|
else:
|
||||||
answers = qa["answers"]
|
answers = qa["answers"]
|
||||||
|
|
||||||
@@ -431,12 +496,13 @@ class SquadProcessor(DataProcessor):
|
|||||||
start_position_character=start_position_character,
|
start_position_character=start_position_character,
|
||||||
title=title,
|
title=title,
|
||||||
is_impossible=is_impossible,
|
is_impossible=is_impossible,
|
||||||
answers=answers
|
answers=answers,
|
||||||
)
|
)
|
||||||
|
|
||||||
examples.append(example)
|
examples.append(example)
|
||||||
return examples
|
return examples
|
||||||
|
|
||||||
|
|
||||||
class SquadV1Processor(SquadProcessor):
|
class SquadV1Processor(SquadProcessor):
|
||||||
train_file = "train-v1.1.json"
|
train_file = "train-v1.1.json"
|
||||||
dev_file = "dev-v1.1.json"
|
dev_file = "dev-v1.1.json"
|
||||||
@@ -445,7 +511,7 @@ class SquadV1Processor(SquadProcessor):
|
|||||||
class SquadV2Processor(SquadProcessor):
|
class SquadV2Processor(SquadProcessor):
|
||||||
train_file = "train-v2.0.json"
|
train_file = "train-v2.0.json"
|
||||||
dev_file = "dev-v2.0.json"
|
dev_file = "dev-v2.0.json"
|
||||||
|
|
||||||
|
|
||||||
class SquadExample(object):
|
class SquadExample(object):
|
||||||
"""
|
"""
|
||||||
@@ -462,21 +528,23 @@ class SquadExample(object):
|
|||||||
is_impossible: False by default, set to True if the example has no possible answer.
|
is_impossible: False by default, set to True if the example has no possible answer.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(
|
||||||
qas_id,
|
self,
|
||||||
question_text,
|
qas_id,
|
||||||
context_text,
|
question_text,
|
||||||
answer_text,
|
context_text,
|
||||||
start_position_character,
|
answer_text,
|
||||||
title,
|
start_position_character,
|
||||||
answers=[],
|
title,
|
||||||
is_impossible=False):
|
answers=[],
|
||||||
|
is_impossible=False,
|
||||||
|
):
|
||||||
self.qas_id = qas_id
|
self.qas_id = qas_id
|
||||||
self.question_text = question_text
|
self.question_text = question_text
|
||||||
self.context_text = context_text
|
self.context_text = context_text
|
||||||
self.answer_text = answer_text
|
self.answer_text = answer_text
|
||||||
self.title = title
|
self.title = title
|
||||||
self.is_impossible = is_impossible
|
self.is_impossible = is_impossible
|
||||||
self.answers = answers
|
self.answers = answers
|
||||||
|
|
||||||
self.start_position, self.end_position = 0, 0
|
self.start_position, self.end_position = 0, 0
|
||||||
@@ -503,7 +571,9 @@ class SquadExample(object):
|
|||||||
# Start and end positions are only set when an answer start character is provided.
|
# Start and end positions are only set when an answer start character is provided.
|
||||||
if start_position_character is not None and not is_impossible:
|
if start_position_character is not None and not is_impossible:
|
||||||
self.start_position = char_to_word_offset[start_position_character]
|
self.start_position = char_to_word_offset[start_position_character]
|
||||||
self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
|
self.end_position = char_to_word_offset[
|
||||||
|
min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class SquadFeatures(object):
|
class SquadFeatures(object):
|
||||||
@@ -531,24 +601,23 @@ class SquadFeatures(object):
|
|||||||
end_position: end of the answer token index
|
end_position: end of the answer token index
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(
|
||||||
input_ids,
|
self,
|
||||||
attention_mask,
|
input_ids,
|
||||||
token_type_ids,
|
attention_mask,
|
||||||
cls_index,
|
token_type_ids,
|
||||||
p_mask,
|
cls_index,
|
||||||
|
p_mask,
|
||||||
example_index,
|
example_index,
|
||||||
unique_id,
|
unique_id,
|
||||||
paragraph_len,
|
paragraph_len,
|
||||||
token_is_max_context,
|
token_is_max_context,
|
||||||
tokens,
|
tokens,
|
||||||
token_to_orig_map,
|
token_to_orig_map,
|
||||||
|
start_position,
|
||||||
start_position,
|
end_position,
|
||||||
end_position
|
):
|
||||||
):
|
self.input_ids = input_ids
|
||||||
self.input_ids = input_ids
|
|
||||||
self.attention_mask = attention_mask
|
self.attention_mask = attention_mask
|
||||||
self.token_type_ids = token_type_ids
|
self.token_type_ids = token_type_ids
|
||||||
self.cls_index = cls_index
|
self.cls_index = cls_index
|
||||||
@@ -574,12 +643,13 @@ class SquadResult(object):
|
|||||||
start_logits: The logits corresponding to the start of the answer
|
start_logits: The logits corresponding to the start of the answer
|
||||||
end_logits: The logits corresponding to the end of the answer
|
end_logits: The logits corresponding to the end of the answer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
|
def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
|
||||||
self.start_logits = start_logits
|
self.start_logits = start_logits
|
||||||
self.end_logits = end_logits
|
self.end_logits = end_logits
|
||||||
self.unique_id = unique_id
|
self.unique_id = unique_id
|
||||||
|
|
||||||
if start_top_index:
|
if start_top_index:
|
||||||
self.start_top_index = start_top_index
|
self.start_top_index = start_top_index
|
||||||
self.end_top_index = end_top_index
|
self.end_top_index = end_top_index
|
||||||
self.cls_logits = cls_logits
|
self.cls_logits = cls_logits
|
||||||
|
|||||||
@@ -21,11 +21,23 @@ import boto3
|
|||||||
from botocore.config import Config
|
from botocore.config import Config
|
||||||
from botocore.exceptions import ClientError
|
from botocore.exceptions import ClientError
|
||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm
|
from tqdm.auto import tqdm
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
try:
|
||||||
|
os.environ.setdefault('USE_TORCH', 'YES')
|
||||||
|
if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'):
|
||||||
|
import torch
|
||||||
|
_torch_available = True # pylint: disable=invalid-name
|
||||||
|
logger.info("PyTorch version {} available.".format(torch.__version__))
|
||||||
|
else:
|
||||||
|
logger.info("USE_TORCH override through env variable, disabling PyTorch")
|
||||||
|
_torch_available = False
|
||||||
|
except ImportError:
|
||||||
|
_torch_available = False # pylint: disable=invalid-name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
os.environ.setdefault('USE_TF', 'YES')
|
os.environ.setdefault('USE_TF', 'YES')
|
||||||
if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'):
|
if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'):
|
||||||
@@ -36,24 +48,9 @@ try:
|
|||||||
else:
|
else:
|
||||||
logger.info("USE_TF override through env variable, disabling Tensorflow")
|
logger.info("USE_TF override through env variable, disabling Tensorflow")
|
||||||
_tf_available = False
|
_tf_available = False
|
||||||
|
|
||||||
except (ImportError, AssertionError):
|
except (ImportError, AssertionError):
|
||||||
_tf_available = False # pylint: disable=invalid-name
|
_tf_available = False # pylint: disable=invalid-name
|
||||||
|
|
||||||
try:
|
|
||||||
os.environ.setdefault('USE_TORCH', 'YES')
|
|
||||||
if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'):
|
|
||||||
import torch
|
|
||||||
_torch_available = True # pylint: disable=invalid-name
|
|
||||||
logger.info("PyTorch version {} available.".format(torch.__version__))
|
|
||||||
|
|
||||||
else:
|
|
||||||
logger.info("USE_TORCH override through env variable, disabling PyTorch")
|
|
||||||
_torch_available = False
|
|
||||||
except ImportError:
|
|
||||||
_torch_available = False # pylint: disable=invalid-name
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from torch.hub import _get_torch_home
|
from torch.hub import _get_torch_home
|
||||||
torch_cache_home = _get_torch_home()
|
torch_cache_home = _get_torch_home()
|
||||||
@@ -84,6 +81,13 @@ WEIGHTS_NAME = "pytorch_model.bin"
|
|||||||
TF2_WEIGHTS_NAME = 'tf_model.h5'
|
TF2_WEIGHTS_NAME = 'tf_model.h5'
|
||||||
TF_WEIGHTS_NAME = 'model.ckpt'
|
TF_WEIGHTS_NAME = 'model.ckpt'
|
||||||
CONFIG_NAME = "config.json"
|
CONFIG_NAME = "config.json"
|
||||||
|
MODEL_CARD_NAME = "model_card.json"
|
||||||
|
|
||||||
|
DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
|
||||||
|
DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
|
||||||
|
|
||||||
|
S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
|
||||||
|
|
||||||
|
|
||||||
def is_torch_available():
|
def is_torch_available():
|
||||||
return _torch_available
|
return _torch_available
|
||||||
@@ -116,6 +120,18 @@ else:
|
|||||||
return fn
|
return fn
|
||||||
return docstring_decorator
|
return docstring_decorator
|
||||||
|
|
||||||
|
|
||||||
|
def is_remote_url(url_or_filename):
|
||||||
|
parsed = urlparse(url_or_filename)
|
||||||
|
return parsed.scheme in ('http', 'https', 's3')
|
||||||
|
|
||||||
|
def hf_bucket_url(identifier, postfix=None):
|
||||||
|
if postfix is None:
|
||||||
|
return "/".join((S3_BUCKET_PREFIX, identifier))
|
||||||
|
else:
|
||||||
|
return "/".join((S3_BUCKET_PREFIX, identifier, postfix))
|
||||||
|
|
||||||
|
|
||||||
def url_to_filename(url, etag=None):
|
def url_to_filename(url, etag=None):
|
||||||
"""
|
"""
|
||||||
Convert `url` into a hashed filename in a repeatable way.
|
Convert `url` into a hashed filename in a repeatable way.
|
||||||
@@ -184,9 +200,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
|||||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
cache_dir = str(cache_dir)
|
cache_dir = str(cache_dir)
|
||||||
|
|
||||||
parsed = urlparse(url_or_filename)
|
if is_remote_url(url_or_filename):
|
||||||
|
|
||||||
if parsed.scheme in ('http', 'https', 's3'):
|
|
||||||
# URL, so get it from the cache (downloading if necessary)
|
# URL, so get it from the cache (downloading if necessary)
|
||||||
return get_from_cache(url_or_filename, cache_dir=cache_dir,
|
return get_from_cache(url_or_filename, cache_dir=cache_dir,
|
||||||
force_download=force_download, proxies=proxies,
|
force_download=force_download, proxies=proxies,
|
||||||
@@ -194,7 +208,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
|||||||
elif os.path.exists(url_or_filename):
|
elif os.path.exists(url_or_filename):
|
||||||
# File, and it exists.
|
# File, and it exists.
|
||||||
return url_or_filename
|
return url_or_filename
|
||||||
elif parsed.scheme == '':
|
elif urlparse(url_or_filename).scheme == '':
|
||||||
# File, but it doesn't exist.
|
# File, but it doesn't exist.
|
||||||
raise EnvironmentError("file {} not found".format(url_or_filename))
|
raise EnvironmentError("file {} not found".format(url_or_filename))
|
||||||
else:
|
else:
|
||||||
@@ -258,7 +272,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0):
|
|||||||
return
|
return
|
||||||
content_length = response.headers.get('Content-Length')
|
content_length = response.headers.get('Content-Length')
|
||||||
total = resume_size + int(content_length) if content_length is not None else None
|
total = resume_size + int(content_length) if content_length is not None else None
|
||||||
progress = tqdm(unit="B", total=total, initial=resume_size)
|
progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading")
|
||||||
for chunk in response.iter_content(chunk_size=1024):
|
for chunk in response.iter_content(chunk_size=1024):
|
||||||
if chunk: # filter out keep-alive new chunks
|
if chunk: # filter out keep-alive new chunks
|
||||||
progress.update(len(chunk))
|
progress.update(len(chunk))
|
||||||
|
|||||||
@@ -131,8 +131,9 @@ class HfApi:
|
|||||||
# the client still has to specify it when uploading the file.
|
# the client still has to specify it when uploading the file.
|
||||||
with open(filepath, "rb") as f:
|
with open(filepath, "rb") as f:
|
||||||
pf = TqdmProgressFileReader(f)
|
pf = TqdmProgressFileReader(f)
|
||||||
|
data = f if pf.total_size > 0 else ""
|
||||||
|
|
||||||
r = requests.put(urls.write, data=f, headers={
|
r = requests.put(urls.write, data=data, headers={
|
||||||
"content-type": urls.type,
|
"content-type": urls.type,
|
||||||
})
|
})
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|||||||
226
transformers/model_card.py
Normal file
226
transformers/model_card.py
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Model card class and utilities."""
|
||||||
|
|
||||||
|
from __future__ import (absolute_import, division, print_function,
|
||||||
|
unicode_literals)
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
|
from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ModelCard(object):
|
||||||
|
r""" Model Card class.
|
||||||
|
Store model card as well as methods for loading/downloading/saving model cards.
|
||||||
|
|
||||||
|
Please read the following paper for details and explanation on the sections:
|
||||||
|
"Model Cards for Model Reporting"
|
||||||
|
by Margaret Mitchell, Simone Wu,
|
||||||
|
Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
|
||||||
|
Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
|
||||||
|
Link: https://arxiv.org/abs/1810.03993
|
||||||
|
|
||||||
|
Note:
|
||||||
|
A model card can be loaded and saved to disk.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
"""
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
# Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
|
||||||
|
self.model_details = kwargs.pop('model_details', {})
|
||||||
|
self.intended_use = kwargs.pop('intended_use', {})
|
||||||
|
self.factors = kwargs.pop('factors', {})
|
||||||
|
self.metrics = kwargs.pop('metrics', {})
|
||||||
|
self.evaluation_data = kwargs.pop('evaluation_data', {})
|
||||||
|
self.training_data = kwargs.pop('training_data', {})
|
||||||
|
self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
|
||||||
|
self.ethical_considerations = kwargs.pop('ethical_considerations', {})
|
||||||
|
self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
|
||||||
|
|
||||||
|
# Open additional attributes
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
try:
|
||||||
|
setattr(self, key, value)
|
||||||
|
except AttributeError as err:
|
||||||
|
logger.error("Can't set {} with value {} for {}".format(key, value, self))
|
||||||
|
raise err
|
||||||
|
|
||||||
|
def save_pretrained(self, save_directory_or_file):
|
||||||
|
""" Save a model card object to the directory or file `save_directory_or_file`.
|
||||||
|
"""
|
||||||
|
if os.path.isdir(save_directory_or_file):
|
||||||
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
|
output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
|
||||||
|
else:
|
||||||
|
output_model_card_file = save_directory_or_file
|
||||||
|
|
||||||
|
self.to_json_file(output_model_card_file)
|
||||||
|
logger.info("Model card saved in {}".format(output_model_card_file))
|
||||||
|
|
||||||
|
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    r""" Instantiate a :class:`~transformers.ModelCard` from the model card of a pre-trained model.

    If the model card file cannot be loaded (``EnvironmentError``) or cannot be parsed as JSON,
    a warning is logged and an empty model card is used instead of raising.

    Parameters:
        pretrained_model_name_or_path: either:

            - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
            - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing a model card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
            - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/model_card.json``.

        cache_dir: (`optional`) string:
            Path to a directory in which a downloaded pre-trained model
            card should be cached if the standard cache should not be used.

        kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.

            - The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
            - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.

        force_download: (`optional`) boolean, default False:
            Force to (re-)download the model card file and override the cached version if it exists.

        resume_download: (`optional`) boolean, default False:
            Do not delete an incompletely received file. Attempt to resume the download if such a file exists.

        proxies: (`optional`) dict, default None:
            A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
            The proxies are used on each request.

        return_unused_kwargs: (`optional`) bool:

            - If False, then this function returns just the final model card object.
            - If True, then this function returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.

    Examples::

        model_card = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from S3 and cache.
        model_card = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
        model_card = ModelCard.from_pretrained('./test/saved_model/model_card.json')
        model_card = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)

    """
    cache_dir = kwargs.pop('cache_dir', None)
    force_download = kwargs.pop('force_download', False)
    resume_download = kwargs.pop('resume_download', False)
    proxies = kwargs.pop('proxies', None)
    return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)

    # `json.JSONDecodeError` only exists on Python 3, where it subclasses ValueError. On Python 2 the
    # attribute lookup itself would fail inside the `except` clause, so fall back to ValueError there.
    json_decode_error = getattr(json, 'JSONDecodeError', ValueError)

    if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
        # For simplicity we use the same pretrained url as the configuration files but with a different suffix (model_card.json)
        model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
        model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
    elif os.path.isdir(pretrained_model_name_or_path):
        model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
    elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
        model_card_file = pretrained_model_name_or_path
    else:
        model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)

    # Bound up front so the JSON error handler below can always format its message.
    resolved_model_card_file = None
    try:
        # Load from URL or cache if already cached
        resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download,
                                               proxies=proxies, resume_download=resume_download)
        if resolved_model_card_file == model_card_file:
            logger.info("loading model card file {}".format(model_card_file))
        else:
            logger.info("loading model card file {} from cache at {}".format(
                model_card_file, resolved_model_card_file))
        # Load model card
        model_card = cls.from_json_file(resolved_model_card_file)

    except EnvironmentError:
        if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            logger.warning("Couldn't reach server at '{}' to download model card file.".format(
                model_card_file))
        else:
            logger.warning("Model name '{}' was not found in model name list ({}). "
                           "We assumed '{}' was a path or url to a model card file named {} or "
                           "a directory containing such a file but couldn't find any such file at this path or url.".format(
                               pretrained_model_name_or_path,
                               ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
                               model_card_file, MODEL_CARD_NAME))
        logger.warning("Creating an empty model card.")

        # We fall back on creating an empty model card
        model_card = cls()

    except json_decode_error:
        logger.warning("Couldn't reach server at '{}' to download model card file or "
                       "model card file is not a valid JSON file. "
                       "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
        logger.warning("Creating an empty model card.")

        # We fall back on creating an empty model card
        model_card = cls()

    # Update model card with kwargs if needed: only keys that already exist as attributes are
    # applied; they are collected first so kwargs is not mutated while being iterated.
    to_remove = []
    for key, value in kwargs.items():
        if hasattr(model_card, key):
            setattr(model_card, key, value)
            to_remove.append(key)
    for key in to_remove:
        kwargs.pop(key, None)

    logger.info("Model card: %s", str(model_card))
    if return_unused_kwargs:
        # Whatever remains in kwargs was not a model card attribute.
        return model_card, kwargs
    else:
        return model_card
|
||||||
|
|
||||||
|
@classmethod
def from_dict(cls, json_object):
    """Build a `ModelCard` by passing the entries of `json_object` as keyword arguments to the constructor."""
    model_card = cls(**json_object)
    return model_card
|
||||||
|
|
||||||
|
@classmethod
def from_json_file(cls, json_file):
    """Build a `ModelCard` from the JSON object stored in the UTF-8 file `json_file`."""
    with open(json_file, "r", encoding='utf-8') as handle:
        # The decoded JSON object's entries become the constructor's keyword arguments.
        parameters = json.load(handle)
    return cls(**parameters)
|
||||||
|
|
||||||
|
def __eq__(self, other):
    """Compare two objects by their instance attributes.

    Returns True/False when `other` exposes a `__dict__`. Returns
    `NotImplemented` when it does not (e.g. `None` or an int), so that Python
    falls back to its default comparison instead of raising AttributeError.
    """
    if not hasattr(other, '__dict__'):
        return NotImplemented
    return self.__dict__ == other.__dict__
|
||||||
|
|
||||||
|
def __repr__(self):
    """Use the JSON serialization of the instance as its representation."""
    json_text = self.to_json_string()
    return str(json_text)
|
||||||
|
|
||||||
|
def to_dict(self):
    """Return a deep copy of the instance attributes as a Python dictionary."""
    # Deep copy so that callers mutating the result never touch the instance's own state.
    return copy.deepcopy(vars(self))
|
||||||
|
|
||||||
|
def to_json_string(self):
    """Return the instance as a JSON string (2-space indent, sorted keys, trailing newline)."""
    serialized = json.dumps(self.to_dict(), sort_keys=True, indent=2)
    return serialized + "\n"
|
||||||
|
|
||||||
|
def to_json_file(self, json_file_path):
    """ Write the JSON serialization of this instance to `json_file_path` (UTF-8, overwriting)."""
    with open(json_file_path, "w", encoding='utf-8') as handle:
        handle.write(self.to_json_string())
|
||||||
@@ -23,21 +23,24 @@ from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRL
|
|||||||
TransfoXLConfig, XLMConfig, XLNetConfig)
|
TransfoXLConfig, XLMConfig, XLNetConfig)
|
||||||
|
|
||||||
from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
|
from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
|
||||||
BertForTokenClassification
|
BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
|
from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
|
from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
|
from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
|
from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \
|
from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \
|
||||||
XLNetForTokenClassification
|
XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
|
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \
|
||||||
|
XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \
|
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \
|
||||||
RobertaForTokenClassification
|
RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \
|
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \
|
||||||
DistilBertForSequenceClassification, DistilBertForTokenClassification
|
DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \
|
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \
|
||||||
CamembertForMultipleChoice, CamembertForTokenClassification
|
CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
|
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
|
||||||
|
AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_utils import PreTrainedModel, SequenceSummary
|
from .modeling_utils import PreTrainedModel, SequenceSummary
|
||||||
|
|
||||||
@@ -46,6 +49,24 @@ from .file_utils import add_start_docstrings
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Merge every architecture-specific archive map into one lookup table.
# The maps are visited in the order listed, so if two maps share a key the later map's value is kept.
ALL_PRETRAINED_MODEL_ARCHIVE_MAP = {
    shortcut_name: archive_url
    for archive_map in (
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
    )
    for shortcut_name, archive_url in archive_map.items()
}
|
||||||
|
|
||||||
|
|
||||||
class AutoModel(object):
|
class AutoModel(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~transformers.AutoModel` is a generic model class
|
:class:`~transformers.AutoModel` is a generic model class
|
||||||
@@ -58,6 +79,7 @@ class AutoModel(object):
|
|||||||
|
|
||||||
The base model class to instantiate is selected as the first pattern matching
|
The base model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: T5Model (T5 model)
|
||||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||||
- contains `albert`: AlbertModel (ALBERT model)
|
- contains `albert`: AlbertModel (ALBERT model)
|
||||||
- contains `camembert`: CamembertModel (CamemBERT model)
|
- contains `camembert`: CamembertModel (CamemBERT model)
|
||||||
@@ -130,6 +152,7 @@ class AutoModel(object):
|
|||||||
|
|
||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: T5Model (T5 model)
|
||||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||||
- contains `albert`: AlbertModel (ALBERT model)
|
- contains `albert`: AlbertModel (ALBERT model)
|
||||||
- contains `camembert`: CamembertModel (CamemBERT model)
|
- contains `camembert`: CamembertModel (CamemBERT model)
|
||||||
@@ -149,6 +172,7 @@ class AutoModel(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
@@ -201,7 +225,9 @@ class AutoModel(object):
|
|||||||
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 't5' in pretrained_model_name_or_path:
|
||||||
|
return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'albert' in pretrained_model_name_or_path:
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
@@ -240,6 +266,7 @@ class AutoModelWithLMHead(object):
|
|||||||
|
|
||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: T5ModelWithLMHead (T5 model)
|
||||||
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
||||||
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
||||||
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
||||||
@@ -311,6 +338,7 @@ class AutoModelWithLMHead(object):
|
|||||||
|
|
||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: T5ModelWithLMHead (T5 model)
|
||||||
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
||||||
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
||||||
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
||||||
@@ -330,6 +358,7 @@ class AutoModelWithLMHead(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
@@ -381,7 +410,9 @@ class AutoModelWithLMHead(object):
|
|||||||
model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 't5' in pretrained_model_name_or_path:
|
||||||
|
return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'albert' in pretrained_model_name_or_path:
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
@@ -494,6 +525,7 @@ class AutoModelForSequenceClassification(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
@@ -642,6 +674,7 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
@@ -818,10 +851,10 @@ class AutoModelForTokenClassification:
|
|||||||
return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'distilbert' in pretrained_model_name_or_path:
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
|
||||||
return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
|
return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'xlnet' in pretrained_model_name_or_path:
|
elif 'xlnet' in pretrained_model_name_or_path:
|
||||||
return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
|
||||||
|
|||||||
@@ -48,6 +48,12 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
|
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
|
||||||
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
|
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
|
||||||
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
|
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
|
||||||
|
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
|
||||||
|
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
|
||||||
|
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
|
||||||
|
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
|
||||||
|
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
|
||||||
|
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1233,9 +1239,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
|||||||
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||||
input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
|
input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
|
||||||
input_ids = tokenizer.encode(input_text)
|
input_ids = tokenizer.encode(input_text)
|
||||||
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
|
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
|
||||||
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
|
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
|
||||||
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||||
print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
|
print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
|
||||||
# a nice puppet
|
# a nice puppet
|
||||||
|
|
||||||
|
|||||||
@@ -59,12 +59,14 @@ class PreTrainedEncoderDecoder(nn.Module):
|
|||||||
encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
|
encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
|
decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
|
|
||||||
@@ -217,9 +219,7 @@ class PreTrainedEncoderDecoder(nn.Module):
|
|||||||
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
|
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
|
||||||
if encoder_hidden_states is None:
|
if encoder_hidden_states is None:
|
||||||
encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
|
encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
|
||||||
encoder_hidden_states = encoder_outputs[
|
encoder_hidden_states = encoder_outputs[0]
|
||||||
0
|
|
||||||
] # output the last layer hidden state
|
|
||||||
else:
|
else:
|
||||||
encoder_outputs = ()
|
encoder_outputs = ()
|
||||||
|
|
||||||
|
|||||||
@@ -634,6 +634,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(GPT2DoubleHeadsModel, self).__init__(config)
|
super(GPT2DoubleHeadsModel, self).__init__(config)
|
||||||
|
config.num_labels = 1
|
||||||
self.transformer = GPT2Model(config)
|
self.transformer = GPT2Model(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
self.multiple_choice_head = SequenceSummary(config)
|
self.multiple_choice_head = SequenceSummary(config)
|
||||||
|
|||||||
@@ -590,6 +590,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
|
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
|
||||||
|
|
||||||
|
config.num_labels = 1
|
||||||
self.transformer = OpenAIGPTModel(config)
|
self.transformer = OpenAIGPTModel(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
self.multiple_choice_head = SequenceSummary(config)
|
self.multiple_choice_head = SequenceSummary(config)
|
||||||
|
|||||||
886
transformers/modeling_t5.py
Normal file
886
transformers/modeling_t5.py
Normal file
@@ -0,0 +1,886 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" PyTorch T5 model. """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import copy
|
||||||
|
import itertools
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
|
||||||
|
from .modeling_utils import PreTrainedModel, prune_linear_layer
|
||||||
|
from .configuration_t5 import T5Config
|
||||||
|
from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# This dict contains shortcut names and associated url
|
||||||
|
# for the pretrained weights provided with the models
|
||||||
|
####################################################
|
||||||
|
# Maps each T5 shortcut name to the URL of its pretrained PyTorch weights file.
T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
}
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# This is a conversion method from TF 1.0 to PyTorch
|
||||||
|
# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
|
||||||
|
####################################################
|
||||||
|
def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """ Load the variables of a TensorFlow checkpoint into a PyTorch model.

    Each TF variable name is split on ``/`` and the pieces are followed as
    attribute names (``name_<digits>`` pieces additionally index into the
    attribute) starting from ``model``; the array found in the checkpoint is
    then written into the ``.data`` of the tensor reached that way.

    Args:
        model: PyTorch module whose weights are overwritten in place.
        config: accepted for interface compatibility; not read by this function.
        tf_checkpoint_path: path of the TensorFlow checkpoint to read.

    Returns:
        The same ``model`` object that was passed in.

    Raises:
        ImportError: re-raised (after logging) when ``re``, ``numpy`` or
            ``tensorflow`` cannot be imported.
        AssertionError: when a checkpoint array and its target tensor have
            different shapes; both shapes are appended to the error args.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
        raise

    # Path components that denote the weight tensor itself rather than a sub-module.
    weight_scopes = ['kernel', 'scale', 'embedding']
    # Path components marking optimizer/bookkeeping variables, which are not model weights.
    optimizer_scopes = ["adam_v", "adam_m", "global_step"]

    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))

    # Read every variable of the checkpoint up front.
    names = []
    tf_weights = {}
    for var_name, var_shape in tf.train.list_variables(tf_path):
        logger.info("Loading TF weight {} with shape {}".format(var_name, var_shape))
        loaded = tf.train.load_variable(tf_path, var_name)
        names.append(var_name)
        tf_weights[var_name] = loaded

    for txt_name in names:
        name = txt_name.split('/')

        # Optimizer state and "_slot_" variables are dropped without being copied.
        is_optimizer_state = any(part in optimizer_scopes for part in name)
        if is_optimizer_state or '_slot_' in name[-1]:
            logger.info("Skipping {}".format("/".join(name)))
            tf_weights.pop(txt_name, None)
            continue

        pointer = model
        array = tf_weights[txt_name]
        for m_name in name:
            # "block_000" style components become ['block', '000', ''].
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                scope_names = re.split(r'_(\d+)', m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] in weight_scopes:
                pointer = getattr(pointer, 'weight')
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    # NOTE(review): this only moves on to the next path component;
                    # the variable itself is still processed below - confirm intended.
                    logger.info("Skipping {}".format("/".join(name)))
                    continue

            if len(scope_names) >= 2:
                pointer = pointer[int(scope_names[1])]

        # ``scope_names`` now refers to the last path component.
        if scope_names[0] not in weight_scopes:
            pointer = getattr(pointer, 'weight')
        if scope_names[0] != 'embedding':
            logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
            array = np.transpose(array)

        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise

        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array.astype(np.float32))
        tf_weights.pop(txt_name, None)

    # Whatever is still in the dict was never written into the model.
    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
    return model
|
||||||
|
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# PyTorch Models are constructed by sub-classing
|
||||||
|
# - torch.nn.Module for the layers and
|
||||||
|
# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
|
||||||
|
####################################################
|
||||||
|
|
||||||
|
class T5LayerNorm(nn.Module):
    """ Layer normalization in the T5 style.

    The input is divided by the root of the mean of its squares over the last
    dimension and multiplied by a learned weight. There is no bias and the
    mean is not subtracted.
    """

    def __init__(self, hidden_size, eps=1e-6):
        """
        Args:
            hidden_size: size of the learned weight vector (initialized to ones).
            eps: value added to the mean square before taking the square root.
        """
        super(T5LayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """ Return ``weight * x / sqrt(mean(x ** 2, dim=-1) + eps)``. """
        mean_square = x.pow(2).mean(-1, keepdim=True)
        normalized = x / torch.sqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized
|
||||||
|
|
||||||
|
|
||||||
|
class T5DenseReluDense(nn.Module):
    """ Two bias-free linear layers with a ReLU and dropout in between.

    ``wi`` maps ``config.d_model`` to ``config.d_ff`` and ``wo`` maps back to
    ``config.d_model``; dropout uses ``config.dropout_rate``.
    """

    def __init__(self, config):
        super(T5DenseReluDense, self).__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        """ Apply ``wo(dropout(relu(wi(hidden_states))))``. """
        activated = F.relu(self.wi(hidden_states))
        return self.wo(self.dropout(activated))
|
||||||
|
|
||||||
|
|
||||||
|
class T5LayerFF(nn.Module):
    """ Feed-forward sub-layer: layer norm, DenseReluDense, dropout, then a residual add. """

    def __init__(self, config):
        super(T5LayerFF, self).__init__()
        self.DenseReluDense = T5DenseReluDense(config)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        """ Return ``hidden_states + dropout(DenseReluDense(layer_norm(hidden_states)))``. """
        transformed = self.DenseReluDense(self.layer_norm(hidden_states))
        # The residual connection uses the un-normalized input.
        return hidden_states + self.dropout(transformed)
|
||||||
|
|
||||||
|
|
||||||
|
class T5Attention(nn.Module):
    """ Multi-head attention in the T5 style.

    The query/key/value/output projections carry no bias, attention scores are
    not scaled before the softmax, and position information comes from a
    bucketed relative position bias that is added to the scores.
    """

    # Shared counter giving every instance a distinct ``layer_id``; that id is
    # the key under which the instance stores its keys/values in ``cache``.
    NEW_ID = itertools.count()

    def __init__(self, config, has_relative_attention_bias=False):
        """
        Args:
            config: object providing ``is_decoder``, ``output_attentions``,
                ``relative_attention_num_buckets``, ``d_model``, ``d_kv``,
                ``num_heads`` and ``dropout_rate``.
            has_relative_attention_bias: when True this module owns the
                ``relative_attention_bias`` embedding and can compute the
                position bias itself.
        """
        super(T5Attention, self).__init__()
        self.layer_id = next(T5Attention.NEW_ID)
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias

        self.output_attentions = config.output_attentions
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.d_model = config.d_model
        self.d_kv = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.d_kv

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """ Remove the given attention heads from the q/k/v/o projections.

        Args:
            heads: iterable of head indices, numbered as in the unpruned
                module; heads that were already pruned are ignored.
        """
        if len(heads) == 0:
            return
        mask = torch.ones(self.n_heads, self.d_kv)
        heads = set(heads) - self.pruned_heads
        for head in heads:
            # Shift the index down by the number of lower heads already removed.
            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.d_kv * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position,
                                  bidirectional=True,
                                  num_buckets=32,
                                  max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention.
        The relative position is defined as memory_position - query_position, i.e.
        the distance in tokens from the attending position to the attended-to
        position.  If bidirectional=False, then positive relative positions are
        invalid.
        We use smaller buckets for small absolute relative_position and larger buckets
        for larger absolute relative_positions.  All relative positions >=max_distance
        map to the same bucket.  All relative positions <=-max_distance map to the
        same bucket.  This should allow for more graceful generalization to longer
        sequences than the model has been trained on.
        Args:
            relative_position: an integer Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer
        Returns:
            a Tensor with the same shape as relative_position, containing integer
            values in the range [0, num_buckets)
        """
        ret = 0
        n = -relative_position
        if bidirectional:
            # Half of the buckets serve each direction; the upper half is
            # selected where n < 0, i.e. for positive relative positions.
            num_buckets //= 2
            ret += (n < 0).to(torch.long) * num_buckets
            n = torch.abs(n)
        else:
            n = torch.max(n, torch.zeros_like(n))
        # now n is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = (n < max_exact)

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        val_if_large = max_exact + (
            torch.log(n.float() / max_exact)
            / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))

        ret += torch.where(is_small, n, val_if_large)
        return ret

    def compute_bias(self, qlen, klen):
        """ Compute binned relative position bias.

        Args:
            qlen: number of query positions.
            klen: number of key positions.

        Returns:
            Tensor of shape (1, num_heads, qlen, klen).
        """
        # Build the indices on the embedding's device: indices left on the
        # default device fail the lookup once the module lives elsewhere.
        device = self.relative_attention_bias.weight.device
        context_position = torch.arange(qlen, dtype=torch.long, device=device)[:, None]
        memory_position = torch.arange(klen, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (qlen, klen)
        rp_bucket = self._relative_position_bucket(relative_position,  # shape (qlen, klen)
                                                   bidirectional=not self.is_decoder,
                                                   num_buckets=self.relative_attention_num_buckets)
        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)
        return values

    def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None):
        """
        Self-attention (if kv is None) or attention over source sentence (provided by kv).

        Args:
            input: tensor of shape (bs, qlen, dim) from which queries are computed.
            mask: optional tensor added to the position bias; it is only
                applied when the position bias is computed in this call.
            kv: optional tensor of shape (bs, klen, dim) used for keys and values.
            position_bias: optional precomputed bias added to the scores.
            cache: optional dict holding an ``'slen'`` entry and, per
                ``layer_id``, the (keys, values) of earlier calls; updated in place.
            head_mask: optional tensor multiplied into the attention weights.

        Returns:
            Tuple ``(context,)``, followed by the attention weights when
            ``output_attentions`` is set, followed by the position bias when
            this module has a relative attention bias.

        Raises:
            ValueError: if ``position_bias`` is None and this module has no
                relative attention bias weights to compute it from.
        """
        # Input is (bs, qlen, dim)
        bs, qlen, _ = input.size()
        if kv is None:
            klen = qlen if cache is None else cache['slen'] + qlen
        else:
            klen = kv.size(1)

        def shape(x):
            """  projection """
            return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2)

        def unshape(x):
            """  compute context """
            return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)

        q = shape(self.q(input))  # (bs, n_heads, qlen, dim_per_head)
        if kv is None:
            k = shape(self.k(input))  # (bs, n_heads, qlen, dim_per_head)
            v = shape(self.v(input))  # (bs, n_heads, qlen, dim_per_head)
        elif cache is None or self.layer_id not in cache:
            k = v = kv
            k = shape(self.k(k))  # (bs, n_heads, klen, dim_per_head)
            v = shape(self.v(v))  # (bs, n_heads, klen, dim_per_head)

        if cache is not None:
            if self.layer_id in cache:
                if kv is None:
                    # Self-attention: extend the cached keys/values with the new ones.
                    k_, v_ = cache[self.layer_id]
                    k = torch.cat([k_, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
                    v = torch.cat([v_, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
                else:
                    # Attention over kv: reuse the cached projections as they are.
                    k, v = cache[self.layer_id]
            cache[self.layer_id] = (k, v)

        # q = q / math.sqrt(dim_per_head)  # No scaling in T5
        scores = torch.einsum('bnqd,bnkd->bnqk', q, k)  # (bs, n_heads, qlen, klen)

        if position_bias is None:
            if not self.has_relative_attention_bias:
                raise ValueError("No position_bias provided and no weights to compute position_bias")
            position_bias = self.compute_bias(qlen, klen)
            if mask is not None:
                position_bias = position_bias + mask  # (bs, n_heads, qlen, klen)

        scores += position_bias
        weights = F.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
        context = unshape(context)  # (bs, qlen, dim)

        context = self.o(context)

        outputs = (context,)
        if self.output_attentions:
            outputs = outputs + (weights,)
        if self.has_relative_attention_bias:
            outputs = outputs + (position_bias,)
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class T5LayerSelfAttention(nn.Module):
    """ Self-attention sub-layer: layer norm, T5Attention, dropout, then a residual add. """

    def __init__(self, config, has_relative_attention_bias=False):
        super(T5LayerSelfAttention, self).__init__()
        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
        """ Attend over the normalized input and add the result to ``hidden_states``.

        Returns:
            Tuple whose first item is the updated hidden states; the remaining
            items are whatever the attention module returned after its context.
        """
        attention_output = self.SelfAttention(
            self.layer_norm(hidden_states),
            mask=attention_mask,
            position_bias=position_bias,
            head_mask=head_mask,
        )
        attended = attention_output[0]
        extras = attention_output[1:]  # attentions / position bias, when present
        # The residual connection uses the un-normalized input.
        return (hidden_states + self.dropout(attended),) + extras
|
||||||
|
|
||||||
|
|
||||||
|
class T5LayerCrossAttention(nn.Module):
    """ Cross-attention sub-layer: layer norm, T5Attention over ``kv``, dropout, then a residual add. """

    def __init__(self, config, has_relative_attention_bias=False):
        super(T5LayerCrossAttention, self).__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
        """ Attend from the normalized input to ``kv`` and add the result to ``hidden_states``.

        Returns:
            Tuple whose first item is the updated hidden states; the remaining
            items are whatever the attention module returned after its context.
        """
        attention_output = self.EncDecAttention(
            self.layer_norm(hidden_states),
            mask=attention_mask,
            kv=kv,
            position_bias=position_bias,
            head_mask=head_mask,
        )
        attended = attention_output[0]
        extras = attention_output[1:]  # attentions / position bias, when present
        # The residual connection uses the un-normalized input.
        return (hidden_states + self.dropout(attended),) + extras
|
||||||
|
|
||||||
|
|
||||||
|
class T5Block(nn.Module):
    """ One T5 block.

    ``self.layer`` holds, in order: self-attention, cross-attention (only when
    ``config.is_decoder`` is set), and the feed-forward sub-layer.
    """

    def __init__(self, config, has_relative_attention_bias=False):
        super(T5Block, self).__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
        if self.is_decoder:
            self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias))
        # The feed-forward sub-layer always comes last.
        self.layer.append(T5LayerFF(config))

    def forward(self, hidden_states, attention_mask=None, position_bias=None,
                encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
                head_mask=None):
        """ Run the sub-layers in order.

        Returns:
            Tuple: hidden-states, (self-attention weights), (self-attention
            position bias), (cross-attention weights), (cross-attention
            position bias).
        """
        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            head_mask=head_mask,
        )
        hidden_states = self_attention_outputs[0]
        # Keep self-attention outputs and relative position weights
        outputs = self_attention_outputs[1:]

        if self.is_decoder:
            cross_attention_outputs = self.layer[1](
                hidden_states,
                kv=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                head_mask=head_mask,
            )
            hidden_states = cross_attention_outputs[0]
            # Keep cross-attention outputs and relative position weights
            outputs = outputs + cross_attention_outputs[1:]

        # Feed-forward is the last entry: index 1 in an encoder block, 2 in a decoder block.
        hidden_states = self.layer[-1](hidden_states)

        return (hidden_states,) + outputs
|
||||||
|
|
||||||
|
|
||||||
|
class T5PreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    config_class = T5Config
    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = load_tf_weights_in_t5
    base_model_prefix = "transformer"

    @property
    def dummy_inputs(self):
        """ Dict of tensors built from ``DUMMY_INPUTS`` and ``DUMMY_MASK``. """
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        # The same ids serve as both encoder and decoder inputs.
        return {
            'decoder_input_ids': input_ids,
            'encoder_input_ids': input_ids,
            'decoder_attention_mask': input_mask,
        }

    def _init_weights(self, module):
        """ Initialize the weights of ``module`` according to its type.

        Every standard deviation (and the layer-norm fill value) is multiplied
        by ``config.initializer_factor``.
        """
        factor = self.config.initializer_factor  # Used for testing weights initialization

        if isinstance(module, T5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
            return

        if isinstance(module, (T5Model, T5WithLMHeadModel)):
            # Mesh TensorFlow embeddings initialization
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
            return

        if isinstance(module, T5DenseReluDense):
            # Mesh TensorFlow FF initialization
            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
            # Each linear layer is scaled by the inverse square root of its input size.
            for linear, fan_in in ((module.wi, self.config.d_model), (module.wo, self.config.d_ff)):
                linear.weight.data.normal_(mean=0.0, std=factor * (fan_in ** -0.5))
                if getattr(linear, 'bias', None) is not None:
                    linear.bias.data.zero_()
            return

        if isinstance(module, T5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
            d_model = self.config.d_model
            d_kv = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
|
||||||
|
|
||||||
|
|
||||||
|
class T5Stack(T5PreTrainedModel):
    """Stack of ``config.num_layers`` :class:`T5Block` modules followed by a layer norm and dropout.

    The same class is used for the encoder and the decoder: ``config.is_decoder``
    switches on causal self-attention masking and the handling of the encoder outputs.
    Only the first block is built with ``has_relative_attention_bias=True``; the position
    biases it returns are handed to all the following blocks.
    """

    def __init__(self, config):
        super(T5Stack, self).__init__(config)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.is_decoder = config.is_decoder

        self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
                                    for i in range(config.num_layers)])
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.init_weights()

    def forward(self,
                hidden_states,
                attention_mask=None,
                encoder_hidden_states=None,
                encoder_attention_mask=None,
                head_mask=None):
        """Run the already-embedded inputs through every block of the stack.

        Args:
            hidden_states: input embeddings; the first two dimensions are read as
                ``(batch_size, seq_length)``.
            attention_mask: (`optional`) 2D padding mask ``[batch_size, seq_length]`` or 3D mask
                ``[batch_size, from_seq_length, to_seq_length]`` with 1 for positions to attend
                and 0 for masked positions. Defaults to all ones.
            encoder_hidden_states: (`optional`) encoder output forwarded to every block.
                It is read when the stack is a decoder and ``encoder_attention_mask`` is None.
            encoder_attention_mask: (`optional`) 2D or 3D mask over the encoder positions,
                only used when the stack is a decoder. Defaults to all ones.
            head_mask: (`optional`) tensor of shape ``[num_heads]`` or ``[num_layers, num_heads]``
                with 1.0 for the heads to keep.

        Returns:
            Tuple ``(last hidden state, (all hidden states), (all self-attention weights))``; the
            optional entries are present when ``output_hidden_states`` / ``output_attentions`` are set.

        Raises:
            ValueError: if ``attention_mask`` (or, for a decoder, ``encoder_attention_mask``)
                is neither 2D nor 3D.
        """
        batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
        if attention_mask is None:
            attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device)
        if self.is_decoder and encoder_attention_mask is None:
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if self.config.is_decoder:
                seq_ids = torch.arange(seq_length, device=hidden_states.device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                causal_mask = causal_mask.to(attention_mask)
                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            # Fail with an explicit message instead of an unbound local variable further down.
            raise ValueError("Wrong shape for attention_mask (shape {}): expected a 2D or 3D tensor".format(
                tuple(attention_mask.shape)))

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -1e9 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.

        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
        # extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))

        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9

        if self.is_decoder:
            # If a 2D or 3D attention mask is provided for the cross-attention
            # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if encoder_attention_mask.dim() == 3:
                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
            elif encoder_attention_mask.dim() == 2:
                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
            else:
                raise ValueError("Wrong shape for encoder_attention_mask (shape {}): expected a 2D or 3D tensor".format(
                    tuple(encoder_attention_mask.shape)))

            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
            # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))

            encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_layers

        all_hidden_states = ()
        all_attentions = ()
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(hidden_states)
        for i, layer_module in enumerate(self.block):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(hidden_states,
                                         attention_mask=extended_attention_mask,
                                         position_bias=position_bias,
                                         encoder_hidden_states=encoder_hidden_states,
                                         encoder_attention_mask=encoder_extended_attention_mask,
                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
                                         head_mask=head_mask[i])
            # layer_outputs is a tuple with:
            # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
            hidden_states = layer_outputs[0]
            if i == 0:
                # We share the position biases between the layers - the first layer stores them.
                # Their index in layer_outputs shifts when the attention weights are also returned.
                position_bias = layer_outputs[2 if self.output_attentions else 1]
                if self.is_decoder:
                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]

            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)  # We keep only self-attention weights for now

        hidden_states = self.final_layer_norm(hidden_states)
        # Keep the dropped-out tensor: it used to be stored in an unused local, which
        # silently disabled the final dropout during training.
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
|
||||||
|
|
||||||
|
|
||||||
|
T5_START_DOCSTRING = r""" The T5 model was proposed in
|
||||||
|
`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
|
||||||
|
by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
|
||||||
|
It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
|
||||||
|
|
||||||
|
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
|
||||||
|
refer to the PyTorch documentation for all matter related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
|
||||||
|
https://arxiv.org/abs/1910.10683
|
||||||
|
|
||||||
|
.. _`torch.nn.Module`:
|
||||||
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
|
||||||
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
T5_INPUTS_DOCSTRING = r"""
|
||||||
|
Inputs:
|
||||||
|
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||||
|
|
||||||
|
(a) For sequence pairs:
|
||||||
|
|
||||||
|
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||||
|
|
||||||
|
(b) For single sequences:
|
||||||
|
|
||||||
|
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||||
|
|
||||||
|
T5 is a model with relative position embeddings so you should be able to pad the inputs on
|
||||||
|
the right or the left.
|
||||||
|
|
||||||
|
Indices can be obtained using :class:`transformers.T5Tokenizer`.
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states "
                      "without any specific head on top.",
                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class T5Model(T5PreTrainedModel):
    r"""
        Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
            **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
                Sequence of hidden-states at the output of the last layer of the model.
            **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
                list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
                of shape ``(batch_size, sequence_length, hidden_size)``:
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            **attentions**: (`optional`, returned when ``config.output_attentions=True``)
                list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

        Examples::

            tokenizer = T5Tokenizer.from_pretrained('t5-small')
            model = T5Model.from_pretrained('t5-small')
            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids=input_ids)
            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config):
        super(T5Model, self).__init__(config)
        # One embedding matrix is used for both the encoder and the decoder inputs.
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        self.encoder = T5Stack(encoder_config)

        # The decoder stack gets its own copy of the config so the flag does not leak to the encoder.
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = T5Stack(decoder_config)

        self.init_weights()

    def get_input_embeddings(self):
        """Return the embedding module shared by the encoder and the decoder."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the embedding module shared by the encoder and the decoder."""
        self.shared = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        # NOTE(review): T5Stack stores its blocks in `self.block`; no `layer` attribute is
        # defined on it in this file section, so this lookup looks like it would raise
        # AttributeError. Confirm the intended attribute path before relying on head pruning.
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(self, **kwargs):
        """Encode the encoder inputs (unless encoder hidden states are supplied) and decode.

        Keyword arguments come in 3 flavors: encoder-specific (prefixed by `encoder_`),
        decoder-specific (prefixed by `decoder_`) and those that apply to the model as whole.
        The specific kwargs override the common ones in case of conflict.

        Returns:
            The decoder stack outputs followed by the encoder stack outputs (the latter is an
            empty tuple when ``encoder_hidden_states`` was passed in).
        """
        kwargs_common = {k: v for k, v in kwargs.items()
                         if not k.startswith(("encoder_", "decoder_"))}
        kwargs_encoder = kwargs_common.copy()
        kwargs_decoder = kwargs_common.copy()
        kwargs_encoder.update({k[len("encoder_"):]: v for k, v in kwargs.items() if k.startswith("encoder_")})
        kwargs_decoder.update({k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")})

        # Encode if needed (training, first prediction pass)
        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
        encoder_attention_mask = kwargs_encoder.get("attention_mask", None)
        if encoder_hidden_states is None:
            # Convert encoder inputs in embeddings if needed
            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
            if hidden_states is None:
                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings

            if encoder_attention_mask is not None:
                # Zero out the embeddings of the masked positions; the mask is cast to the
                # dtype/device of the embeddings so the product is well defined.
                encoder_attention_mask = (encoder_attention_mask != 0).to(hidden_states)
                hidden_states = hidden_states * encoder_attention_mask.unsqueeze(-1)

            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
            encoder_hidden_states = encoder_outputs[0]
        else:
            encoder_outputs = ()

        # Decode
        # Convert decoder inputs in embeddings if needed
        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
        if hidden_states is None:
            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
            hidden_states = self.shared(decoder_inputs_ids)

        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
        kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask
        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)

        return decoder_outputs + encoder_outputs
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class T5WithLMHeadModel(T5PreTrainedModel):
    r"""
        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the masked language modeling loss.
            May also be passed as ``decoder_lm_labels``, which takes precedence when both are given.
            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``

        Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
            **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
                Masked language modeling loss.
            **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
                Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
            **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
                list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
                of shape ``(batch_size, sequence_length, hidden_size)``:
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            **attentions**: (`optional`, returned when ``config.output_attentions=True``)
                list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

        Examples::

            tokenizer = T5Tokenizer.from_pretrained('t5-small')
            model = T5WithLMHeadModel.from_pretrained('t5-small')
            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids=input_ids, lm_labels=input_ids)
            loss, prediction_scores = outputs[:2]

    """
    def __init__(self, config):
        super(T5WithLMHeadModel, self).__init__(config)
        self.model_dim = config.d_model

        # One embedding matrix is used for both the encoder and the decoder inputs.
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        self.encoder = T5Stack(encoder_config)

        # The decoder stack gets its own copy of the config so the flag does not leak to the encoder.
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = T5Stack(decoder_config)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        self.init_weights()

    def get_input_embeddings(self):
        """Return the embedding module shared by the encoder and the decoder."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the embedding module shared by the encoder and the decoder."""
        self.shared = new_embeddings

    def get_output_embeddings(self):
        """Return the linear layer projecting the decoder output on the vocabulary."""
        return self.lm_head

    def forward(self, **kwargs):
        """Encode, decode, project on the vocabulary and optionally compute the LM loss.

        Keyword arguments come in 3 flavors: encoder-specific (prefixed by `encoder_`),
        decoder-specific (prefixed by `decoder_`) and those that apply to the model as whole.
        The specific kwargs override the common ones in case of conflict.
        Labels are read from ``decoder_lm_labels`` or, failing that, from ``lm_labels``.

        Returns:
            ``(loss,)`` when labels are given, then the prediction scores and the remaining
            decoder stack outputs, followed by the encoder stack outputs (an empty tuple when
            ``encoder_hidden_states`` was passed in).
        """
        # Labels must be removed from kwargs before routing: the stacks do not accept them.
        # `lm_labels` is the spelling used by the class documentation; it previously ended up
        # in the common kwargs and was forwarded to the encoder stack as an unexpected keyword.
        decoder_lm_labels = kwargs.pop('decoder_lm_labels', None)
        common_lm_labels = kwargs.pop('lm_labels', None)
        lm_labels = decoder_lm_labels if decoder_lm_labels is not None else common_lm_labels

        kwargs_common = {k: v for k, v in kwargs.items()
                         if not k.startswith(("encoder_", "decoder_"))}
        kwargs_encoder = kwargs_common.copy()
        kwargs_decoder = kwargs_common.copy()
        kwargs_encoder.update({k[len("encoder_"):]: v for k, v in kwargs.items() if k.startswith("encoder_")})
        kwargs_decoder.update({k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")})

        # Encode if needed (training, first prediction pass)
        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
        if encoder_hidden_states is None:
            # Convert encoder inputs in embeddings if needed
            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
            if hidden_states is None:
                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings

            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
            encoder_hidden_states = encoder_outputs[0]
        else:
            encoder_outputs = ()

        # Decode
        # Convert decoder inputs in embeddings if needed
        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
        if hidden_states is None:
            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
            hidden_states = self.shared(decoder_inputs_ids)

        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)

        sequence_output = decoder_outputs[0]
        # Rescale output before projecting on vocab
        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
        sequence_output = sequence_output * (self.model_dim ** -0.5)
        lm_logits = self.lm_head(sequence_output)

        decoder_outputs = (lm_logits,) + decoder_outputs[1:]  # Add hidden states and attention if they are here
        if lm_labels is not None:
            # Position t of the logits is scored against label t + 1.
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = lm_labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))
            decoder_outputs = (loss,) + decoder_outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

        return decoder_outputs + encoder_outputs
|
||||||
@@ -587,8 +587,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from transformers import AlbertTokenizer, TFAlbertModel
|
from transformers import AlbertTokenizer, TFAlbertModel
|
||||||
|
|
||||||
tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
|
||||||
model = TFAlbertModel.from_pretrained('bert-base-uncased')
|
model = TFAlbertModel.from_pretrained('albert-base-v1')
|
||||||
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids)
|
||||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||||
|
|||||||
@@ -23,22 +23,43 @@ from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig,
|
|||||||
TransfoXLConfig, XLMConfig, XLNetConfig)
|
TransfoXLConfig, XLMConfig, XLNetConfig)
|
||||||
|
|
||||||
from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \
|
from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \
|
||||||
TFBertForQuestionAnswering, TFBertForTokenClassification
|
TFBertForQuestionAnswering, TFBertForTokenClassification, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
|
from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
|
from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
|
from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \
|
from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \
|
||||||
TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification
|
TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
|
from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, \
|
||||||
from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification
|
TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification
|
from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, \
|
||||||
from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel
|
TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .file_utils import add_start_docstrings
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Single lookup table merging the pretrained archive maps of the TF model modules listed below.
# The maps are merged in the order given, so for a duplicated key the entry of the later map wins.
TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = {
    model_key: model_value
    for archive_map in [
        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
    ]
    for model_key, model_value in archive_map.items()
}
|
||||||
|
|
||||||
|
|
||||||
class TFAutoModel(object):
|
class TFAutoModel(object):
|
||||||
r"""
|
r"""
|
||||||
:class:`~transformers.TFAutoModel` is a generic model class
|
:class:`~transformers.TFAutoModel` is a generic model class
|
||||||
@@ -51,6 +72,7 @@ class TFAutoModel(object):
|
|||||||
|
|
||||||
The base model class to instantiate is selected as the first pattern matching
|
The base model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: TFT5Model (T5 model)
|
||||||
- contains `distilbert`: TFDistilBertModel (DistilBERT model)
|
- contains `distilbert`: TFDistilBertModel (DistilBERT model)
|
||||||
- contains `roberta`: TFRobertaModel (RoBERTa model)
|
- contains `roberta`: TFRobertaModel (RoBERTa model)
|
||||||
- contains `bert`: TFBertModel (Bert model)
|
- contains `bert`: TFBertModel (Bert model)
|
||||||
@@ -117,6 +139,7 @@ class TFAutoModel(object):
|
|||||||
|
|
||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: TFT5Model (T5 model)
|
||||||
- contains `distilbert`: TFDistilBertModel (DistilBERT model)
|
- contains `distilbert`: TFDistilBertModel (DistilBERT model)
|
||||||
- contains `roberta`: TFRobertaModel (RoBERTa model)
|
- contains `roberta`: TFRobertaModel (RoBERTa model)
|
||||||
- contains `bert`: TFBertModel (Bert model)
|
- contains `bert`: TFBertModel (Bert model)
|
||||||
@@ -130,6 +153,7 @@ class TFAutoModel(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||||
|
|
||||||
@@ -185,8 +209,12 @@ class TFAutoModel(object):
|
|||||||
model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
|
model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 't5' in pretrained_model_name_or_path:
|
||||||
|
return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -221,6 +249,7 @@ class TFAutoModelWithLMHead(object):
|
|||||||
|
|
||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: TFT5WithLMHeadModel (T5 model)
|
||||||
- contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
|
- contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
|
||||||
- contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
|
- contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
|
||||||
- contains `bert`: TFBertForMaskedLM (Bert model)
|
- contains `bert`: TFBertForMaskedLM (Bert model)
|
||||||
@@ -290,6 +319,7 @@ class TFAutoModelWithLMHead(object):
|
|||||||
|
|
||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `t5`: TFT5WithLMHeadModel (T5 model)
|
||||||
- contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
|
- contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
|
||||||
- contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
|
- contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
|
||||||
- contains `bert`: TFBertForMaskedLM (Bert model)
|
- contains `bert`: TFBertForMaskedLM (Bert model)
|
||||||
@@ -304,6 +334,7 @@ class TFAutoModelWithLMHead(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||||
|
|
||||||
@@ -359,8 +390,12 @@ class TFAutoModelWithLMHead(object):
|
|||||||
model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
|
model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 't5' in pretrained_model_name_or_path:
|
||||||
|
return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -461,6 +496,7 @@ class TFAutoModelForSequenceClassification(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||||
|
|
||||||
@@ -518,6 +554,8 @@ class TFAutoModelForSequenceClassification(object):
|
|||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -604,6 +642,7 @@ class TFAutoModelForQuestionAnswering(object):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
|
||||||
|
|
||||||
|
|||||||
@@ -48,6 +48,12 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
|
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
|
||||||
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
|
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
|
||||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
|
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
|
||||||
|
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
|
||||||
|
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
|
||||||
|
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
|
||||||
|
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
|
||||||
|
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
|
||||||
|
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -129,7 +135,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
linear tensor, float32 with shape [batch_size, length, vocab_size].
|
linear tensor, float32 with shape [batch_size, length, vocab_size].
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: if mode is not valid.
|
ValueError: if mode is not valid.
|
||||||
|
|
||||||
Shared weights logic adapted from
|
Shared weights logic adapted from
|
||||||
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
|
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
|
||||||
"""
|
"""
|
||||||
@@ -148,7 +154,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
input_shape = shape_list(input_ids)
|
input_shape = shape_list(input_ids)
|
||||||
else:
|
else:
|
||||||
input_shape = shape_list(inputs_embeds)[:-1]
|
input_shape = shape_list(inputs_embeds)[:-1]
|
||||||
|
|
||||||
seq_length = input_shape[1]
|
seq_length = input_shape[1]
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
||||||
@@ -246,7 +252,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
|||||||
context_layer = tf.matmul(attention_probs, value_layer)
|
context_layer = tf.matmul(attention_probs, value_layer)
|
||||||
|
|
||||||
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
|
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
|
||||||
context_layer = tf.reshape(context_layer,
|
context_layer = tf.reshape(context_layer,
|
||||||
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
|
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
|
||||||
|
|
||||||
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
|
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
|
||||||
@@ -591,7 +597,7 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
|
|||||||
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
"""
|
"""
|
||||||
@@ -605,13 +611,13 @@ BERT_INPUTS_DOCSTRING = r"""
|
|||||||
(a) For sequence pairs:
|
(a) For sequence pairs:
|
||||||
|
|
||||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||||
|
|
||||||
(b) For single sequences:
|
(b) For single sequences:
|
||||||
|
|
||||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0``
|
``token_type_ids: 0 0 0 0 0 0 0``
|
||||||
|
|
||||||
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
|
|||||||
@@ -574,6 +574,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
||||||
|
config.num_labels = 1
|
||||||
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
||||||
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
||||||
|
|
||||||
|
|||||||
@@ -538,6 +538,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
|
||||||
|
config.num_labels = 1
|
||||||
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
|
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
|
||||||
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
||||||
|
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
|
|||||||
logger.info("Loading PyTorch weights from {}".format(pt_path))
|
logger.info("Loading PyTorch weights from {}".format(pt_path))
|
||||||
|
|
||||||
pt_state_dict = torch.load(pt_path, map_location='cpu')
|
pt_state_dict = torch.load(pt_path, map_location='cpu')
|
||||||
|
logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values())))
|
||||||
|
|
||||||
return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
|
return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
|
||||||
|
|
||||||
@@ -134,7 +135,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
|
|||||||
start_prefix_to_remove = tf_model.base_model_prefix + '.'
|
start_prefix_to_remove = tf_model.base_model_prefix + '.'
|
||||||
|
|
||||||
symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
|
symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
|
||||||
|
tf_loaded_numel = 0
|
||||||
weight_value_tuples = []
|
weight_value_tuples = []
|
||||||
all_pytorch_weights = set(list(pt_state_dict.keys()))
|
all_pytorch_weights = set(list(pt_state_dict.keys()))
|
||||||
for symbolic_weight in symbolic_weights:
|
for symbolic_weight in symbolic_weights:
|
||||||
@@ -142,7 +143,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
|
|||||||
name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)
|
name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)
|
||||||
|
|
||||||
# Find associated numpy array in pytorch model state dict
|
# Find associated numpy array in pytorch model state dict
|
||||||
assert name in pt_state_dict, "{} not found in PyTorch model".format(name)
|
if name not in pt_state_dict:
|
||||||
|
if allow_missing_keys:
|
||||||
|
continue
|
||||||
|
raise AttributeError("{} not found in PyTorch model".format(name))
|
||||||
|
|
||||||
array = pt_state_dict[name].numpy()
|
array = pt_state_dict[name].numpy()
|
||||||
|
|
||||||
if transpose:
|
if transpose:
|
||||||
@@ -159,7 +164,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
|
|||||||
e.args += (symbolic_weight.shape, array.shape)
|
e.args += (symbolic_weight.shape, array.shape)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
logger.info("Initialize TF weight {}".format(symbolic_weight.name))
|
tf_loaded_numel += array.size
|
||||||
|
# logger.warning("Initialize TF weight {}".format(symbolic_weight.name))
|
||||||
|
|
||||||
weight_value_tuples.append((symbolic_weight, array))
|
weight_value_tuples.append((symbolic_weight, array))
|
||||||
all_pytorch_weights.discard(name)
|
all_pytorch_weights.discard(name)
|
||||||
@@ -169,6 +175,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
|
|||||||
if tf_inputs is not None:
|
if tf_inputs is not None:
|
||||||
tfo = tf_model(tf_inputs, training=False) # Make sure restore ops are run
|
tfo = tf_model(tf_inputs, training=False) # Make sure restore ops are run
|
||||||
|
|
||||||
|
logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel))
|
||||||
|
|
||||||
logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))
|
logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))
|
||||||
|
|
||||||
return tf_model
|
return tf_model
|
||||||
@@ -246,6 +254,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
|
|||||||
|
|
||||||
all_tf_weights = set(list(tf_weights_map.keys()))
|
all_tf_weights = set(list(tf_weights_map.keys()))
|
||||||
loaded_pt_weights_data_ptr = {}
|
loaded_pt_weights_data_ptr = {}
|
||||||
|
missing_keys_pt = []
|
||||||
for pt_weight_name, pt_weight in current_pt_params_dict.items():
|
for pt_weight_name, pt_weight in current_pt_params_dict.items():
|
||||||
# Handle PyTorch shared weight ()not duplicated in TF 2.0
|
# Handle PyTorch shared weight ()not duplicated in TF 2.0
|
||||||
if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
|
if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
|
||||||
@@ -254,7 +263,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
|
|||||||
|
|
||||||
# Find associated numpy array in pytorch model state dict
|
# Find associated numpy array in pytorch model state dict
|
||||||
if pt_weight_name not in tf_weights_map:
|
if pt_weight_name not in tf_weights_map:
|
||||||
raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name))
|
if allow_missing_keys:
|
||||||
|
missing_keys_pt.append(pt_weight_name)
|
||||||
|
continue
|
||||||
|
raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name))
|
||||||
|
|
||||||
array, transpose = tf_weights_map[pt_weight_name]
|
array, transpose = tf_weights_map[pt_weight_name]
|
||||||
|
|
||||||
@@ -272,13 +284,14 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
|
|||||||
e.args += (pt_weight.shape, array.shape)
|
e.args += (pt_weight.shape, array.shape)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
logger.info("Initialize PyTorch weight {}".format(pt_weight_name))
|
# logger.warning("Initialize PyTorch weight {}".format(pt_weight_name))
|
||||||
|
|
||||||
new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
|
new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
|
||||||
loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
|
loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
|
||||||
all_tf_weights.discard(pt_weight_name)
|
all_tf_weights.discard(pt_weight_name)
|
||||||
|
|
||||||
missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
|
missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
|
||||||
|
missing_keys += missing_keys_pt
|
||||||
|
|
||||||
if len(missing_keys) > 0:
|
if len(missing_keys) > 0:
|
||||||
logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
|
logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
|
||||||
|
|||||||
775
transformers/modeling_tf_t5.py
Normal file
775
transformers/modeling_tf_t5.py
Normal file
@@ -0,0 +1,775 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 T5 model. """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import copy
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .configuration_t5 import T5Config
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
|
||||||
|
from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each T5 shortcut name to the URL of its pretrained TF 2.0 weights file.
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
}
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# TF 2.0 Models are constructed using Keras imperative API by sub-classing
|
||||||
|
# - tf.keras.layers.Layer for the layers and
|
||||||
|
# - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
|
||||||
|
####################################################
|
||||||
|
|
||||||
|
class TFT5LayerNorm(tf.keras.layers.Layer):
    """Layer normalization in the T5 style: a learned scale only, with no bias
    and no subtraction of the mean.
    """

    def __init__(self, epsilon=1e-6, **kwargs):
        """ Construct a layernorm module in the T5 style
            No bias and no subtraction of mean.

        Args:
            epsilon: constant added to the mean of squares before taking the
                reciprocal square root.
            **kwargs: forwarded to ``tf.keras.layers.Layer.__init__``.
        """
        super(TFT5LayerNorm, self).__init__(**kwargs)
        self.variance_epsilon = epsilon

    def build(self, input_shape):
        """Create the scale weight (one value per last-axis feature, initialized to ones)."""
        self.weight = self.add_weight(
            "weight",
            shape=(input_shape[-1],),
            initializer='ones')
        super(TFT5LayerNorm, self).build(input_shape)

    def call(self, x):
        """Scale ``x`` by the reciprocal root mean square of its last axis, then by ``weight``."""
        # Mean of squares over the last axis; the mean itself is never subtracted,
        # so this is not a centered variance despite the variable name.
        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
        x = x * tf.math.rsqrt(variance + self.variance_epsilon)
        return self.weight * x
|
||||||
|
|
||||||
|
|
||||||
|
class TFT5DenseReluDense(tf.keras.layers.Layer):
    """Bias-free feed-forward block: Dense(d_ff) -> ReLU -> dropout -> Dense(d_model)."""

    def __init__(self, config, **kwargs):
        """Build the two projections and the dropout layer.

        Args:
            config: configuration object; ``d_ff``, ``d_model`` and
                ``dropout_rate`` are read from it.
            **kwargs: forwarded to ``tf.keras.layers.Layer.__init__``.
        """
        super(TFT5DenseReluDense, self).__init__(**kwargs)
        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi')
        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo')
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
        self.act = tf.keras.activations.relu

    def call(self, hidden_states, training=False):
        """Apply ``wi``, ReLU, dropout and ``wo`` in that order.

        Args:
            hidden_states: input tensor.
            training: forwarded to the dropout layer.

        Returns:
            The output of the ``wo`` projection.
        """
        h = self.wi(hidden_states)
        h = self.act(h)
        h = self.dropout(h, training=training)
        h = self.wo(h)
        return h
|
||||||
|
|
||||||
|
|
||||||
|
class TFT5LayerFF(tf.keras.layers.Layer):
    """Pre-norm residual wrapper around the feed-forward block:
    ``hidden_states + dropout(DenseReluDense(layer_norm(hidden_states)))``.
    """

    def __init__(self, config, **kwargs):
        """Build the feed-forward block, its layer norm and the residual dropout.

        Args:
            config: configuration object; ``layer_norm_epsilon`` and
                ``dropout_rate`` are read here, and the object is also passed
                on to ``TFT5DenseReluDense``.
            **kwargs: forwarded to ``tf.keras.layers.Layer.__init__``.
        """
        super(TFT5LayerFF, self).__init__(**kwargs)
        self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense')
        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
                                        name='layer_norm')
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

    def call(self, hidden_states, training=False):
        """Normalize, run the feed-forward block, and add the result to the input.

        Args:
            hidden_states: input tensor.
            training: forwarded to the feed-forward block and to dropout.

        Returns:
            ``hidden_states`` plus the (dropped-out) feed-forward output.
        """
        norm_x = self.layer_norm(hidden_states)
        y = self.DenseReluDense(norm_x, training=training)
        # Residual connection: the un-normalized input is what gets added back.
        layer_output = hidden_states + self.dropout(y, training=training)
        return layer_output
|
||||||
|
|
||||||
|
|
||||||
|
class TFT5Attention(tf.keras.layers.Layer):
|
||||||
|
NEW_ID = itertools.count()
|
||||||
|
|
||||||
|
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
    """Build the q/k/v/o projections and, optionally, the relative attention bias table.

    Args:
        config: configuration object; ``is_decoder``, ``output_attentions``,
            ``relative_attention_num_buckets``, ``d_model``, ``d_kv``,
            ``num_heads`` and ``dropout_rate`` are read from it.
        has_relative_attention_bias: when True, this layer owns a
            ``relative_attention_bias`` embedding.
        **kwargs: forwarded to ``tf.keras.layers.Layer.__init__``.
    """
    super(TFT5Attention, self).__init__(**kwargs)
    # Unique id per instance, drawn from the class-level counter; `call` uses it
    # as this layer's key into the `cache` dict.
    self.layer_id = next(TFT5Attention.NEW_ID)
    self.is_decoder = config.is_decoder
    self.has_relative_attention_bias = has_relative_attention_bias

    self.output_attentions = config.output_attentions
    self.relative_attention_num_buckets = config.relative_attention_num_buckets
    self.d_model = config.d_model
    self.d_kv = config.d_kv
    self.n_heads = config.num_heads
    self.inner_dim = self.n_heads * self.d_kv

    # Mesh TensorFlow initialization to avoid scaling before softmax
    self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q')
    self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k')
    self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v')
    self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o')
    self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

    if self.has_relative_attention_bias:
        # One bias value per (bucket, head).
        self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets,
                                                                 self.n_heads,
                                                                 name='relative_attention_bias')
    self.pruned_heads = set()
|
||||||
|
|
||||||
|
def prune_heads(self, heads):
    """Head pruning is not implemented for this layer.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError
|
||||||
|
|
||||||
|
@staticmethod
def _relative_position_bucket(relative_position,
                              bidirectional=True,
                              num_buckets=32,
                              max_distance=128):
    """
    Adapted from Mesh Tensorflow:
    https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

    Translate relative position to a bucket number for relative attention.
    The relative position is defined as memory_position - query_position, i.e.
    the distance in tokens from the attending position to the attended-to
    position.  If bidirectional=False, then positive relative positions are
    invalid.
    We use smaller buckets for small absolute relative_position and larger buckets
    for larger absolute relative_positions.  All relative positions >=max_distance
    map to the same bucket.  All relative positions <=-max_distance map to the
    same bucket.  This should allow for more graceful generalization to longer
    sequences than the model has been trained on.
    Args:
        relative_position: an int32 Tensor
        bidirectional: a boolean - whether the attention is bidirectional
        num_buckets: an integer
        max_distance: an integer
    Returns:
        a Tensor with the same shape as relative_position, containing int32
        values in the range [0, num_buckets)
    """
    ret = 0
    n = -relative_position
    if bidirectional:
        # Split the buckets in two halves: entries with n < 0 are offset into
        # the upper half, then only the magnitude is bucketed.
        num_buckets //= 2
        ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets
        n = tf.math.abs(n)
    else:
        n = tf.math.maximum(n, 0)
    # now n is in the range [0, inf)
    # Distances below max_exact each get their own bucket.
    max_exact = num_buckets // 2
    is_small = tf.math.less(n, max_exact)
    # Larger distances are spread logarithmically over the remaining buckets.
    val_if_large = max_exact + tf.dtypes.cast(
        tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact)
        / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32)
    # Clamp so everything at or beyond max_distance lands in the last bucket.
    val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
    ret += tf.where(is_small, n, val_if_large)
    return ret
|
||||||
|
|
||||||
|
def compute_bias(self, qlen, klen):
    """ Compute binned relative position bias

    Args:
        qlen: number of query positions.
        klen: number of key positions.

    Returns:
        Bias tensor of shape (1, num_heads, qlen, klen), looked up from
        ``self.relative_attention_bias``.
    """
    context_position = tf.range(qlen)[:, None]
    memory_position = tf.range(klen)[None, :]
    relative_position = memory_position - context_position  # shape (qlen, klen)
    # Bucketing is bidirectional only when this layer is not a decoder layer.
    rp_bucket = self._relative_position_bucket(relative_position,
                                               bidirectional=not self.is_decoder,
                                               num_buckets=self.relative_attention_num_buckets)
    values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
    values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0)  # shape (1, num_heads, qlen, klen)
    return values
|
||||||
|
|
||||||
|
def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False):
    """
    Self-attention (if kv is None) or attention over source sentence (provided by kv).

    When `position_bias` is None it is computed with `compute_bias` (this requires
    `has_relative_attention_bias`) and `mask`, if given, is added into it.
    When `cache` is given, this layer's keys/values are stored under `self.layer_id`.

    Returns a tuple starting with the attention output, followed by the attention
    weights if `self.output_attentions` and by the position bias if
    `self.has_relative_attention_bias`.
    """
    # Input is (bs, qlen, dim)
    # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
    batch, qlen, _ = shape_list(input)
    if kv is not None:
        klen = shape_list(kv)[1]
    elif cache is None:
        klen = qlen
    else:
        klen = cache['slen'] + qlen

    def split_heads(x):
        """ split the projection into heads and move the heads axis before the length axis """
        x = tf.reshape(x, (batch, -1, self.n_heads, self.d_kv))
        return tf.transpose(x, perm=(0, 2, 1, 3))

    def merge_heads(x):
        """ inverse of split_heads: fold the heads back into the last axis """
        x = tf.transpose(x, perm=(0, 2, 1, 3))
        return tf.reshape(x, (batch, -1, self.inner_dim))

    q = split_heads(self.q(input))  # (bs, n_heads, qlen, dim_per_head)
    if kv is None:
        k = split_heads(self.k(input))  # (bs, n_heads, qlen, dim_per_head)
        v = split_heads(self.v(input))  # (bs, n_heads, qlen, dim_per_head)
    elif cache is None or self.layer_id not in cache:
        k = split_heads(self.k(kv))  # (bs, n_heads, klen, dim_per_head)
        v = split_heads(self.v(kv))  # (bs, n_heads, klen, dim_per_head)

    if cache is not None:
        if self.layer_id in cache:
            past_k, past_v = cache[self.layer_id]
            if kv is None:
                # Self-attention: extend the cached keys/values with the new positions
                k = tf.concat([past_k, k], axis=2)  # (bs, n_heads, klen, dim_per_head)
                v = tf.concat([past_v, v], axis=2)  # (bs, n_heads, klen, dim_per_head)
            else:
                # Attention over kv: reuse the cached projections as they are
                k, v = past_k, past_v
        cache[self.layer_id] = (k, v)

    # No scaling of q by sqrt(dim_per_head) in T5
    scores = tf.einsum('bnqd,bnkd->bnqk', q, k)  # (bs, n_heads, qlen, klen)

    if position_bias is None:
        if not self.has_relative_attention_bias:
            raise ValueError("No position_bias provided and no weights to compute position_bias")
        position_bias = self.compute_bias(qlen, klen)
        if mask is not None:
            # The mask is folded into the bias that is returned to the caller
            position_bias = position_bias + mask

    weights = tf.nn.softmax(scores + position_bias, axis=-1)  # (bs, n_heads, qlen, klen)
    weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)

    # Mask heads if we want to
    if head_mask is not None:
        weights = weights * head_mask

    context = merge_heads(tf.matmul(weights, v))  # (bs, qlen, dim)
    context = self.o(context)

    outputs = (context,)
    if self.output_attentions:
        outputs += (weights,)
    if self.has_relative_attention_bias:
        outputs += (position_bias,)
    return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class TFT5LayerSelfAttention(tf.keras.layers.Layer):
    """ Self-attention sub-layer: layer norm, attention, dropout, then a residual add. """

    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super(TFT5LayerSelfAttention, self).__init__(**kwargs)
        self.SelfAttention = TFT5Attention(
            config,
            has_relative_attention_bias=has_relative_attention_bias,
            name='SelfAttention')
        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name='layer_norm')
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

    def call(self, hidden_states, attention_mask=None, position_bias=None,
             head_mask=None, training=False):
        """ Return (updated hidden states,) followed by any extra outputs of the attention module. """
        normed = self.layer_norm(hidden_states)
        attn = self.SelfAttention(normed,
                                  mask=attention_mask,
                                  position_bias=position_bias,
                                  head_mask=head_mask,
                                  training=training)
        # Residual connection around the (dropped-out) attention output
        residual = hidden_states + self.dropout(attn[0], training=training)
        return (residual,) + attn[1:]  # add attentions if we output them
|
||||||
|
|
||||||
|
|
||||||
|
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
    """ Attention over `kv` sub-layer: layer norm, attention, dropout, then a residual add. """

    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super(TFT5LayerCrossAttention, self).__init__(**kwargs)
        self.EncDecAttention = TFT5Attention(
            config,
            has_relative_attention_bias=has_relative_attention_bias,
            name='EncDecAttention')
        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name='layer_norm')
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

    def call(self, hidden_states, kv, attention_mask=None, position_bias=None,
             head_mask=None, training=False):
        """ Return (updated hidden states,) followed by any extra outputs of the attention module. """
        normed = self.layer_norm(hidden_states)
        attn = self.EncDecAttention(normed,
                                    mask=attention_mask,
                                    kv=kv,
                                    position_bias=position_bias,
                                    head_mask=head_mask,
                                    training=training)
        # Residual connection around the (dropped-out) attention output
        residual = hidden_states + self.dropout(attn[0], training=training)
        return (residual,) + attn[1:]  # add attentions if we output them
|
||||||
|
|
||||||
|
|
||||||
|
class TFT5Block(tf.keras.layers.Layer):
    """ One T5 block: self-attention, then (decoder only) cross-attention, then feed-forward. """

    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
        super(TFT5Block, self).__init__(**kwargs)
        self.is_decoder = config.is_decoder

        # Sub-layers are stored in execution order; the feed-forward layer is always last
        sub_layers = [TFT5LayerSelfAttention(config,
                                             has_relative_attention_bias=has_relative_attention_bias,
                                             name='layer_._0')]
        if self.is_decoder:
            sub_layers.append(TFT5LayerCrossAttention(config,
                                                      has_relative_attention_bias=has_relative_attention_bias,
                                                      name='layer_._1'))
            sub_layers.append(TFT5LayerFF(config, name='layer_._2'))
        else:
            sub_layers.append(TFT5LayerFF(config, name='layer_._1'))
        self.layer = sub_layers

    def call(self, hidden_states, attention_mask=None, position_bias=None,
             encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
             head_mask=None, training=False):
        """ Run the sub-layers in order.

        Returns: hidden-states, (self-attention weights), (self-attention position bias),
        (cross-attention weights), (cross-attention position bias)
        """
        self_attn = self.layer[0](hidden_states,
                                  attention_mask=attention_mask,
                                  position_bias=position_bias,
                                  head_mask=head_mask,
                                  training=training)
        hidden_states, extras = self_attn[0], self_attn[1:]

        if self.is_decoder:
            cross_attn = self.layer[1](hidden_states,
                                       kv=encoder_hidden_states,
                                       attention_mask=encoder_attention_mask,
                                       position_bias=encoder_decoder_position_bias,
                                       head_mask=head_mask,
                                       training=training)
            hidden_states = cross_attn[0]
            extras = extras + cross_attn[1:]
            feed_forward = self.layer[2]
        else:
            feed_forward = self.layer[1]

        hidden_states = feed_forward(hidden_states, training=training)
        return (hidden_states,) + extras  # add attentions if we output them
|
||||||
|
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# The full model without a specific pretrained or finetuning head is
|
||||||
|
# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer"
|
||||||
|
####################################################
|
||||||
|
class TFT5MainLayer(tf.keras.layers.Layer):
    """ Stack of `config.num_layers` TFT5Block layers followed by a final layer norm and dropout.

    The same class serves as encoder or decoder depending on `config.is_decoder`.
    Only the first block is created with relative attention bias weights; the position
    biases it returns are passed to all following blocks.
    """

    def __init__(self, config, **kwargs):
        super(TFT5MainLayer, self).__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.is_decoder = config.is_decoder
        self.config = config
        self.num_hidden_layers = config.num_layers

        self.block = [TFT5Block(config,
                                has_relative_attention_bias=bool(i == 0),
                                name='block_._{}'.format(i))
                      for i in range(config.num_layers)]
        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
                                              name='final_layer_norm')
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models

    def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None,
             encoder_attention_mask=None, head_mask=None, training=False):
        """ Run the stack of blocks on already-embedded `hidden_states`.

        Args:
            hidden_states: embedded inputs; the first two dimensions are read as
                (batch_size, seq_length).
            attention_mask: optional rank-2 or rank-3 mask, 1 for positions to attend to
                and 0 for masked positions. Defaults to all ones.
            encoder_hidden_states: encoder output, used when this layer is a decoder.
            encoder_attention_mask: optional rank-2 or rank-3 mask over the encoder
                positions (decoder only). Defaults to all ones.
            head_mask: not supported yet, must be None.
            training: forwarded to the blocks and to dropout.

        Returns:
            Tuple: last-layer hidden state, (all hidden states), (all attentions).

        Raises:
            NotImplementedError: if `head_mask` is provided.
            ValueError: if a mask has a rank other than 2 or 3.
        """
        batch_size, seq_length = shape_list(hidden_states)[:2]
        if attention_mask is None:
            attention_mask = tf.fill((batch_size, seq_length), 1)
        if self.is_decoder and encoder_attention_mask is None:
            # shape_list also works when the encoder length is only known at run time
            encoder_seq_length = shape_list(encoder_hidden_states)[1]
            encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
        num_dims_attention_mask = len(shape_list(attention_mask))
        if num_dims_attention_mask == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif num_dims_attention_mask == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if self.config.is_decoder:
                seq_ids = tf.range(seq_length)
                causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)),
                                            seq_ids[None, :, None])
                causal_mask = tf.cast(causal_mask, dtype=tf.float32)
                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError("attention_mask should have 2 or 3 dimensions, got {}".format(
                num_dims_attention_mask))

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -1e9 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        #
        # NOTE: T5 has a mask that can compare sequence ids; this is not simulated here.
        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9

        if self.is_decoder:
            # If a 2D or 3D attention mask is provided for the cross-attention
            # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
            if num_dims_encoder_attention_mask == 3:
                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
            elif num_dims_encoder_attention_mask == 2:
                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
            else:
                raise ValueError("encoder_attention_mask should have 2 or 3 dimensions, got {}".format(
                    num_dims_encoder_attention_mask))

            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        if head_mask is not None:
            raise NotImplementedError
        head_mask = [None] * self.num_hidden_layers

        all_hidden_states = ()
        all_attentions = ()
        position_bias = None
        encoder_decoder_position_bias = None
        for i, layer_module in enumerate(self.block):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(hidden_states,
                                         attention_mask=extended_attention_mask,
                                         position_bias=position_bias,
                                         encoder_hidden_states=encoder_hidden_states,
                                         encoder_attention_mask=encoder_extended_attention_mask,
                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
                                         head_mask=head_mask[i],
                                         training=training)
            hidden_states = layer_outputs[0]
            if i == 0:
                # We share the position biases between the layers - the first layer stores them
                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias),
                #                 (cross-attention weights), (cross-attention position bias)
                position_bias = layer_outputs[2 if self.output_attentions else 1]
                if self.is_decoder:
                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]

            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.final_layer_norm(hidden_states)
        # The dropout output must replace hidden_states, otherwise dropout is never applied
        hidden_states = self.dropout(hidden_states, training=training)

        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
|
||||||
|
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# TFT5PreTrainedModel is a sub-class of tf.keras.Model
|
||||||
|
# which take care of loading and saving pretrained weights
|
||||||
|
# and various common utilities.
|
||||||
|
# Here you just need to specify a few (self-explanatory)
|
||||||
|
# pointers for your model.
|
||||||
|
####################################################
|
||||||
|
class TFT5PreTrainedModel(TFPreTrainedModel):
    """ Abstract base class for the TF T5 models: declares the configuration class,
        the pretrained archive map and the base model prefix, and provides dummy inputs.
    """
    config_class = T5Config
    pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "transformer"

    @property
    def dummy_inputs(self):
        """ Dict of constant tensors built from DUMMY_INPUTS and DUMMY_MASK; the same ids feed encoder and decoder. """
        ids = tf.constant(DUMMY_INPUTS)
        mask = tf.constant(DUMMY_MASK)
        return {'decoder_input_ids': ids,
                'encoder_input_ids': ids,
                'decoder_attention_mask': mask}
|
||||||
|
|
||||||
|
|
||||||
|
T5_START_DOCSTRING = r""" The T5 model was proposed in
|
||||||
|
`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
|
||||||
|
by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
|
||||||
|
It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
|
||||||
|
|
||||||
|
This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
|
||||||
|
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
|
||||||
|
https://arxiv.org/abs/1910.10683
|
||||||
|
|
||||||
|
.. _`tf.keras.Model`:
|
||||||
|
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
|
||||||
|
|
||||||
|
Note on the model inputs:
|
||||||
|
TF 2.0 models accept two formats as inputs:
|
||||||
|
|
||||||
|
- having all inputs as keyword arguments (like PyTorch models), or
|
||||||
|
- having all inputs as a list, tuple or dict in the first positional arguments.
|
||||||
|
|
||||||
|
This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
|
||||||
|
|
||||||
|
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
|
||||||
|
|
||||||
|
- a single Tensor with input_ids only and nothing else: `model(inputs_ids)`
|
||||||
|
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
|
||||||
|
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
|
||||||
|
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
|
||||||
|
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
|
||||||
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
T5_INPUTS_DOCSTRING = r"""
|
||||||
|
Inputs:
|
||||||
|
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||||
|
|
||||||
|
(a) For sequence pairs:
|
||||||
|
|
||||||
|
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||||
|
|
||||||
|
(b) For single sequences:
|
||||||
|
|
||||||
|
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||||
|
|
||||||
|
|
||||||
|
T5 is a model with relative position embeddings so you should be able to pad the inputs on
|
||||||
|
the right or the left.
|
||||||
|
|
||||||
|
Indices can be obtained using :class:`transformers.T5Tokenizer`.
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states "
                      "without any specific head on top.",
                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class TFT5Model(TFT5PreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs.
        The decoder outputs come first, followed by the encoder outputs when the encoder was run:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Inputs prefixed with ``encoder_`` are routed to the encoder only, inputs prefixed with
    ``decoder_`` to the decoder only, and all other inputs to both.

    Examples::

        import tensorflow as tf
        from transformers import T5Tokenizer, TFT5Model

        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = TFT5Model.from_pretrained('t5-small')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model({'encoder_input_ids': input_ids, 'decoder_input_ids': input_ids})
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFT5Model, self).__init__(config, *inputs, **kwargs)
        # Token embeddings shared by the encoder and the decoder
        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
                                         name='shared')

        encoder_config = copy.deepcopy(config)
        self.encoder = TFT5MainLayer(encoder_config, name='encoder')

        # The decoder works on its own copy of the config so the caller's config is not modified
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = TFT5MainLayer(decoder_config, name='decoder')

    def get_input_embeddings(self):
        return self.shared

    def get_output_embeddings(self):
        return self.shared

    def call(self, decoder_input_ids, **kwargs):
        """ Encode (unless encoder hidden states are supplied) then decode.

        We allow two types of multi-inputs:
            - traditional keyword arguments in the call method
            - all the arguments provided as a dict in the first positional argument of call
        The last option is useful to use the tf.keras fit() method.

        Returns:
            The decoder outputs tuple followed by the encoder outputs tuple (empty when
            ``encoder_hidden_states`` was supplied and the encoder was skipped).
        """
        if isinstance(decoder_input_ids, dict):
            kwargs.update(decoder_input_ids)
        else:
            kwargs['decoder_input_ids'] = decoder_input_ids

        # Split the arguments: un-prefixed ones go to both stacks, prefixed ones
        # override them for their own stack (with the prefix removed)
        kwargs_common = {k: v for k, v in kwargs.items()
                         if not k.startswith("encoder_") and not k.startswith("decoder_")}
        kwargs_encoder = kwargs_common.copy()
        kwargs_decoder = kwargs_common.copy()
        kwargs_encoder.update({k[len("encoder_"):]: v for k, v in kwargs.items() if k.startswith("encoder_")})
        kwargs_decoder.update({k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")})

        # Encode if needed (training, first prediction pass)
        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
        if encoder_hidden_states is None:
            # Convert encoder inputs in embeddings if needed
            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
            if hidden_states is None:
                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings

            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
            encoder_hidden_states = encoder_outputs[0]
        else:
            encoder_outputs = ()

        # Decode
        # Convert decoder inputs in embeddings if needed
        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
        if hidden_states is None:
            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
            hidden_states = self.shared(decoder_inputs_ids)

        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
        # The encoder padding mask (if any) masks the encoder positions in the decoder
        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)

        return decoder_outputs + encoder_outputs
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class TFT5WithLMHeadModel(TFT5PreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs.
        The decoder outputs come first, followed by the encoder outputs when the encoder was run:
        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Inputs prefixed with ``encoder_`` are routed to the encoder only, inputs prefixed with
    ``decoder_`` to the decoder only, and all other inputs to both.

    Examples::

        import tensorflow as tf
        from transformers import T5Tokenizer, TFT5WithLMHeadModel

        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = TFT5WithLMHeadModel.from_pretrained('t5-small')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model({'encoder_input_ids': input_ids, 'decoder_input_ids': input_ids})
        prediction_scores = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
        self.model_dim = config.d_model

        # Token embeddings shared by the encoder, the decoder and the LM head
        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
                                         name='shared')

        encoder_config = copy.deepcopy(config)
        self.encoder = TFT5MainLayer(encoder_config, name='encoder')

        # The decoder works on its own copy of the config so the caller's config is not modified
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = TFT5MainLayer(decoder_config, name='decoder')

    def get_input_embeddings(self):
        return self.shared

    def get_output_embeddings(self):
        return self.shared

    def call(self, decoder_input_ids, **kwargs):
        """ Encode (unless encoder hidden states are supplied), decode, then project to the vocabulary.

        We allow two types of multi-inputs:
            - traditional keyword arguments in the call method
            - all the arguments provided as a dict in the first positional argument of call
        The last option is useful to use the tf.keras fit() method.

        Returns:
            Tuple starting with the LM logits, then the remaining decoder outputs, then the
            encoder outputs (empty when ``encoder_hidden_states`` was supplied).
        """
        if isinstance(decoder_input_ids, dict):
            kwargs.update(decoder_input_ids)
        else:
            kwargs['decoder_input_ids'] = decoder_input_ids

        # Split the arguments: un-prefixed ones go to both stacks, prefixed ones
        # override them for their own stack (with the prefix removed)
        kwargs_common = {k: v for k, v in kwargs.items()
                         if not k.startswith("encoder_") and not k.startswith("decoder_")}
        kwargs_encoder = kwargs_common.copy()
        kwargs_decoder = kwargs_common.copy()
        kwargs_encoder.update({k[len("encoder_"):]: v for k, v in kwargs.items() if k.startswith("encoder_")})
        kwargs_decoder.update({k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")})

        # Encode if needed (training, first prediction pass)
        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
        if encoder_hidden_states is None:
            # Convert encoder inputs in embeddings if needed
            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
            if hidden_states is None:
                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings

            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
            encoder_hidden_states = encoder_outputs[0]
        else:
            encoder_outputs = ()

        # Decode
        # Convert decoder inputs in embeddings if needed
        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
        if hidden_states is None:
            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
            hidden_states = self.shared(decoder_inputs_ids)

        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
        # The encoder padding mask (if any) masks the encoder positions in the decoder
        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)

        # Rescale by model_dim ** -0.5 before projecting with the shared embedding matrix
        sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
        lm_logits = self.shared(sequence_output, mode="linear")
        decoder_outputs = (lm_logits,) + decoder_outputs[1:]

        return decoder_outputs + encoder_outputs
|
||||||
@@ -353,7 +353,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
|||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
self.n_token = config.n_token
|
self.n_token = config.vocab_size
|
||||||
|
|
||||||
self.d_embed = config.d_embed
|
self.d_embed = config.d_embed
|
||||||
self.d_model = config.d_model
|
self.d_model = config.d_model
|
||||||
@@ -361,7 +361,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
|
|||||||
self.d_head = config.d_head
|
self.d_head = config.d_head
|
||||||
self.untie_r = config.untie_r
|
self.untie_r = config.untie_r
|
||||||
|
|
||||||
self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
|
self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
|
||||||
div_val=config.div_val, init_std=config.init_std, name='word_emb')
|
div_val=config.div_val, init_std=config.init_std, name='word_emb')
|
||||||
|
|
||||||
self.drop = tf.keras.layers.Dropout(config.dropout)
|
self.drop = tf.keras.layers.Dropout(config.dropout)
|
||||||
@@ -729,7 +729,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
# use adaptive softmax (including standard softmax)
|
# use adaptive softmax (including standard softmax)
|
||||||
else:
|
else:
|
||||||
self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model,
|
self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model,
|
||||||
config.cutoffs, div_val=config.div_val, name='crit')
|
config.cutoffs, div_val=config.div_val, name='crit')
|
||||||
|
|
||||||
def reset_length(self, tgt_len, ext_len, mem_len):
|
def reset_length(self, tgt_len, ext_len, mem_len):
|
||||||
|
|||||||
@@ -25,15 +25,15 @@ import tensorflow as tf
|
|||||||
from .modeling_tf_utils import shape_list
|
from .modeling_tf_utils import shape_list
|
||||||
|
|
||||||
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
||||||
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
|
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1,
|
||||||
keep_order=False, **kwargs):
|
keep_order=False, **kwargs):
|
||||||
super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
|
super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
|
||||||
|
|
||||||
self.n_token = n_token
|
self.vocab_size = vocab_size
|
||||||
self.d_embed = d_embed
|
self.d_embed = d_embed
|
||||||
self.d_proj = d_proj
|
self.d_proj = d_proj
|
||||||
|
|
||||||
self.cutoffs = cutoffs + [n_token]
|
self.cutoffs = cutoffs + [vocab_size]
|
||||||
self.cutoff_ends = [0] + self.cutoffs
|
self.cutoff_ends = [0] + self.cutoffs
|
||||||
self.div_val = div_val
|
self.div_val = div_val
|
||||||
|
|
||||||
@@ -66,11 +66,11 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
|||||||
self.out_projs.append(weight)
|
self.out_projs.append(weight)
|
||||||
else:
|
else:
|
||||||
self.out_projs.append(None)
|
self.out_projs.append(None)
|
||||||
weight = self.add_weight(shape=(self.n_token, self.d_embed,),
|
weight = self.add_weight(shape=(self.vocab_size, self.d_embed,),
|
||||||
initializer='zeros',
|
initializer='zeros',
|
||||||
trainable=True,
|
trainable=True,
|
||||||
name='out_layers_._{}_._weight'.format(i))
|
name='out_layers_._{}_._weight'.format(i))
|
||||||
bias = self.add_weight(shape=(self.n_token,),
|
bias = self.add_weight(shape=(self.vocab_size,),
|
||||||
initializer='zeros',
|
initializer='zeros',
|
||||||
trainable=True,
|
trainable=True,
|
||||||
name='out_layers_._{}_._bias'.format(i))
|
name='out_layers_._{}_._bias'.format(i))
|
||||||
@@ -114,7 +114,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
|||||||
hidden, target = inputs
|
hidden, target = inputs
|
||||||
head_logprob = 0
|
head_logprob = 0
|
||||||
if self.n_clusters == 0:
|
if self.n_clusters == 0:
|
||||||
softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer())
|
softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer())
|
||||||
output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
|
output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
|
||||||
if target is not None:
|
if target is not None:
|
||||||
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
|
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
|
||||||
|
|||||||
@@ -26,13 +26,12 @@ from tensorflow.python.keras.saving import hdf5_format
|
|||||||
import h5py
|
import h5py
|
||||||
|
|
||||||
from .configuration_utils import PretrainedConfig
|
from .configuration_utils import PretrainedConfig
|
||||||
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
|
from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
|
||||||
|
cached_path, hf_bucket_url, is_remote_url)
|
||||||
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
|
|
||||||
|
|
||||||
class TFPreTrainedModel(tf.keras.Model):
|
class TFPreTrainedModel(tf.keras.Model):
|
||||||
r""" Base class for all TF models.
|
r""" Base class for all TF models.
|
||||||
|
|
||||||
@@ -61,7 +60,7 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
Returns:
|
Returns:
|
||||||
tf.Tensor with dummy inputs
|
tf.Tensor with dummy inputs
|
||||||
"""
|
"""
|
||||||
return tf.constant(DUMMY_INPUTS)
|
return {'input_ids': tf.constant(DUMMY_INPUTS)}
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
|
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||||
@@ -178,6 +177,7 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
|
- a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
|
||||||
|
|
||||||
@@ -263,12 +263,14 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
|
raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
|
||||||
[WEIGHTS_NAME, TF2_WEIGHTS_NAME],
|
[WEIGHTS_NAME, TF2_WEIGHTS_NAME],
|
||||||
pretrained_model_name_or_path))
|
pretrained_model_name_or_path))
|
||||||
elif os.path.isfile(pretrained_model_name_or_path):
|
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
|
||||||
archive_file = pretrained_model_name_or_path
|
archive_file = pretrained_model_name_or_path
|
||||||
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
|
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
|
||||||
archive_file = pretrained_model_name_or_path + ".index"
|
archive_file = pretrained_model_name_or_path + ".index"
|
||||||
else:
|
else:
|
||||||
archive_file = pretrained_model_name_or_path
|
archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
|
||||||
|
if from_pt:
|
||||||
|
raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.")
|
||||||
|
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
@@ -301,7 +303,7 @@ class TFPreTrainedModel(tf.keras.Model):
|
|||||||
|
|
||||||
if from_pt:
|
if from_pt:
|
||||||
# Load from a PyTorch checkpoint
|
# Load from a PyTorch checkpoint
|
||||||
return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file)
|
return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
|
||||||
|
|
||||||
ret = model(model.dummy_inputs, training=False) # build the network with dummy inputs
|
ret = model(model.dummy_inputs, training=False) # build the network with dummy inputs
|
||||||
|
|
||||||
|
|||||||
@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
|
|||||||
langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
|
langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
|
||||||
else:
|
else:
|
||||||
langs_list = None
|
langs_list = None
|
||||||
return [inputs_list, attns_list, langs_list]
|
return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
|
||||||
|
|
||||||
|
|
||||||
XLM_START_DOCSTRING = r""" The XLM model was proposed in
|
XLM_START_DOCSTRING = r""" The XLM model was proposed in
|
||||||
|
|||||||
@@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
|||||||
self.use_bfloat16 = config.use_bfloat16
|
self.use_bfloat16 = config.use_bfloat16
|
||||||
self.initializer_range = config.initializer_range
|
self.initializer_range = config.initializer_range
|
||||||
|
|
||||||
self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
|
self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
|
||||||
self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
|
self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
|
||||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||||
|
|
||||||
|
|||||||
@@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
self.n_token = config.n_token
|
self.n_token = config.vocab_size
|
||||||
|
|
||||||
self.d_embed = config.d_embed
|
self.d_embed = config.d_embed
|
||||||
self.d_model = config.d_model
|
self.d_model = config.d_model
|
||||||
self.n_head = config.n_head
|
self.n_head = config.n_head
|
||||||
self.d_head = config.d_head
|
self.d_head = config.d_head
|
||||||
|
|
||||||
self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
|
self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
|
||||||
div_val=config.div_val)
|
div_val=config.div_val)
|
||||||
|
|
||||||
self.drop = nn.Dropout(config.dropout)
|
self.drop = nn.Dropout(config.dropout)
|
||||||
@@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
|||||||
self.sample_softmax = config.sample_softmax
|
self.sample_softmax = config.sample_softmax
|
||||||
# use sampled softmax
|
# use sampled softmax
|
||||||
if config.sample_softmax > 0:
|
if config.sample_softmax > 0:
|
||||||
self.out_layer = nn.Linear(config.d_model, config.n_token)
|
self.out_layer = nn.Linear(config.d_model, config.vocab_size)
|
||||||
self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
|
self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
|
||||||
# use adaptive softmax (including standard softmax)
|
# use adaptive softmax (including standard softmax)
|
||||||
else:
|
else:
|
||||||
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
|
self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
|
||||||
config.cutoffs, div_val=config.div_val)
|
config.cutoffs, div_val=config.div_val)
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
|
|||||||
@@ -31,11 +31,11 @@ from torch.nn import CrossEntropyLoss
|
|||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
from .configuration_utils import PretrainedConfig
|
from .configuration_utils import PretrainedConfig
|
||||||
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
|
from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
|
||||||
|
cached_path, hf_bucket_url, is_remote_url)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from torch.nn import Identity
|
from torch.nn import Identity
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -71,6 +71,15 @@ class PreTrainedModel(nn.Module):
|
|||||||
load_tf_weights = lambda model, config, path: None
|
load_tf_weights = lambda model, config, path: None
|
||||||
base_model_prefix = ""
|
base_model_prefix = ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def dummy_inputs(self):
|
||||||
|
""" Dummy inputs to do a forward pass in the network.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
torch.Tensor with dummy inputs
|
||||||
|
"""
|
||||||
|
return {'input_ids': torch.tensor(DUMMY_INPUTS)}
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(PreTrainedModel, self).__init__()
|
super(PreTrainedModel, self).__init__()
|
||||||
if not isinstance(config, PretrainedConfig):
|
if not isinstance(config, PretrainedConfig):
|
||||||
@@ -160,8 +169,7 @@ class PreTrainedModel(nn.Module):
|
|||||||
base_model.vocab_size = new_num_tokens
|
base_model.vocab_size = new_num_tokens
|
||||||
|
|
||||||
# Tie weights again if needed
|
# Tie weights again if needed
|
||||||
if hasattr(self, 'tie_weights'):
|
self.tie_weights()
|
||||||
self.tie_weights()
|
|
||||||
|
|
||||||
return model_embeds
|
return model_embeds
|
||||||
|
|
||||||
@@ -265,6 +273,7 @@ class PreTrainedModel(nn.Module):
|
|||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
|
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
|
||||||
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
- None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
|
- None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
|
||||||
@@ -318,10 +327,6 @@ class PreTrainedModel(nn.Module):
|
|||||||
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
|
|
||||||
logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
|
|
||||||
"https://github.com/google-research/google-research/issues/119 for more information.")
|
|
||||||
|
|
||||||
config = kwargs.pop('config', None)
|
config = kwargs.pop('config', None)
|
||||||
state_dict = kwargs.pop('state_dict', None)
|
state_dict = kwargs.pop('state_dict', None)
|
||||||
cache_dir = kwargs.pop('cache_dir', None)
|
cache_dir = kwargs.pop('cache_dir', None)
|
||||||
@@ -362,14 +367,16 @@ class PreTrainedModel(nn.Module):
|
|||||||
raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
|
raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
|
||||||
[WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
|
[WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
|
||||||
pretrained_model_name_or_path))
|
pretrained_model_name_or_path))
|
||||||
elif os.path.isfile(pretrained_model_name_or_path):
|
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
|
||||||
archive_file = pretrained_model_name_or_path
|
archive_file = pretrained_model_name_or_path
|
||||||
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
|
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
|
||||||
assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
|
assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
|
||||||
pretrained_model_name_or_path + ".index")
|
pretrained_model_name_or_path + ".index")
|
||||||
archive_file = pretrained_model_name_or_path + ".index"
|
archive_file = pretrained_model_name_or_path + ".index"
|
||||||
else:
|
else:
|
||||||
archive_file = pretrained_model_name_or_path
|
archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
|
||||||
|
if from_tf:
|
||||||
|
raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
|
||||||
|
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
@@ -473,8 +480,7 @@ class PreTrainedModel(nn.Module):
|
|||||||
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
|
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
|
||||||
model.__class__.__name__, "\n\t".join(error_msgs)))
|
model.__class__.__name__, "\n\t".join(error_msgs)))
|
||||||
|
|
||||||
if hasattr(model, 'tie_weights'):
|
model.tie_weights() # make sure word embedding weights are still tied if needed
|
||||||
model.tie_weights() # make sure word embedding weights are still tied
|
|
||||||
|
|
||||||
# Set model in evaluation mode to desactivate DropOut modules by default
|
# Set model in evaluation mode to desactivate DropOut modules by default
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|||||||
@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
|
|||||||
def __init__(self, *inputs, **kwargs):
|
def __init__(self, *inputs, **kwargs):
|
||||||
super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
|
super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def dummy_inputs(self):
|
||||||
|
inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
|
||||||
|
attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
|
||||||
|
if self.config.use_lang_emb and self.config.n_langs > 1:
|
||||||
|
langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
|
||||||
|
else:
|
||||||
|
langs_list = None
|
||||||
|
return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
|
||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
""" Initialize the weights. """
|
""" Initialize the weights. """
|
||||||
if isinstance(module, nn.Embedding):
|
if isinstance(module, nn.Embedding):
|
||||||
@@ -646,7 +656,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
|||||||
langs=langs,
|
langs=langs,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
lengths=lengths,
|
lengths=lengths,
|
||||||
cache=cache,
|
cache=cache,
|
||||||
head_mask=head_mask,
|
head_mask=head_mask,
|
||||||
inputs_embeds=inputs_embeds)
|
inputs_embeds=inputs_embeds)
|
||||||
|
|||||||
@@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
self.clamp_len = config.clamp_len
|
self.clamp_len = config.clamp_len
|
||||||
self.n_layer = config.n_layer
|
self.n_layer = config.n_layer
|
||||||
|
|
||||||
self.word_embedding = nn.Embedding(config.n_token, config.d_model)
|
self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
|
||||||
self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
|
self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
|
||||||
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
||||||
self.dropout = nn.Dropout(config.dropout)
|
self.dropout = nn.Dropout(config.dropout)
|
||||||
@@ -940,7 +940,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
self.same_length = config.same_length
|
self.same_length = config.same_length
|
||||||
|
|
||||||
self.transformer = XLNetModel(config)
|
self.transformer = XLNetModel(config)
|
||||||
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
|
||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
|
|||||||
@@ -16,15 +16,12 @@ from __future__ import absolute_import
|
|||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import copy
|
|
||||||
import os
|
import os
|
||||||
import shutil
|
|
||||||
import json
|
import json
|
||||||
import random
|
import tempfile
|
||||||
import uuid
|
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import logging
|
from .tokenization_tests_commons import TemporaryDirectory
|
||||||
|
|
||||||
|
|
||||||
class ConfigTester(object):
|
class ConfigTester(object):
|
||||||
@@ -48,16 +45,28 @@ class ConfigTester(object):
|
|||||||
|
|
||||||
def create_and_test_config_to_json_file(self):
|
def create_and_test_config_to_json_file(self):
|
||||||
config_first = self.config_class(**self.inputs_dict)
|
config_first = self.config_class(**self.inputs_dict)
|
||||||
json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
|
|
||||||
config_first.to_json_file(json_file_path)
|
with TemporaryDirectory() as tmpdirname:
|
||||||
config_second = self.config_class.from_json_file(json_file_path)
|
json_file_path = os.path.join(tmpdirname, "config.json")
|
||||||
os.remove(json_file_path)
|
config_first.to_json_file(json_file_path)
|
||||||
|
config_second = self.config_class.from_json_file(json_file_path)
|
||||||
|
|
||||||
|
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
|
||||||
|
|
||||||
|
def create_and_test_config_from_and_save_pretrained(self):
|
||||||
|
config_first = self.config_class(**self.inputs_dict)
|
||||||
|
|
||||||
|
with TemporaryDirectory() as tmpdirname:
|
||||||
|
config_first.save_pretrained(tmpdirname)
|
||||||
|
config_second = self.config_class.from_pretrained(tmpdirname)
|
||||||
|
|
||||||
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
|
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
|
||||||
|
|
||||||
def run_common_tests(self):
|
def run_common_tests(self):
|
||||||
self.create_and_test_config_common_properties()
|
self.create_and_test_config_common_properties()
|
||||||
self.create_and_test_config_to_json_string()
|
self.create_and_test_config_to_json_string()
|
||||||
self.create_and_test_config_to_json_file()
|
self.create_and_test_config_to_json_file()
|
||||||
|
self.create_and_test_config_from_and_save_pretrained()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
0
transformers/tests/fixtures/empty.txt
vendored
Normal file
0
transformers/tests/fixtures/empty.txt
vendored
Normal file
@@ -15,18 +15,30 @@
|
|||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import six
|
|
||||||
import time
|
import time
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
|
import requests
|
||||||
|
import six
|
||||||
|
|
||||||
|
from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj
|
||||||
|
|
||||||
USER = "__DUMMY_TRANSFORMERS_USER__"
|
USER = "__DUMMY_TRANSFORMERS_USER__"
|
||||||
PASS = "__DUMMY_TRANSFORMERS_PASS__"
|
PASS = "__DUMMY_TRANSFORMERS_PASS__"
|
||||||
FILE_KEY = "Test-{}.txt".format(int(time.time()))
|
FILES = [
|
||||||
FILE_PATH = os.path.join(
|
(
|
||||||
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
|
"Test-{}.txt".format(int(time.time())),
|
||||||
)
|
os.path.join(
|
||||||
|
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"yoyo {}.txt".format(int(time.time())), # space is intentional
|
||||||
|
os.path.join(
|
||||||
|
os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest):
|
|||||||
self.assertEqual(user, USER)
|
self.assertEqual(user, USER)
|
||||||
|
|
||||||
def test_presign(self):
|
def test_presign(self):
|
||||||
urls = self._api.presign(token=self._token, filename=FILE_KEY)
|
for FILE_KEY, FILE_PATH in FILES:
|
||||||
self.assertIsInstance(urls, PresignedUrl)
|
urls = self._api.presign(token=self._token, filename=FILE_KEY)
|
||||||
self.assertEqual(urls.type, "text/plain")
|
self.assertIsInstance(urls, PresignedUrl)
|
||||||
|
self.assertEqual(urls.type, "text/plain")
|
||||||
|
|
||||||
def test_presign_and_upload(self):
|
def test_presign_and_upload(self):
|
||||||
access_url = self._api.presign_and_upload(
|
for FILE_KEY, FILE_PATH in FILES:
|
||||||
token=self._token, filename=FILE_KEY, filepath=FILE_PATH
|
access_url = self._api.presign_and_upload(
|
||||||
)
|
token=self._token, filename=FILE_KEY, filepath=FILE_PATH
|
||||||
self.assertIsInstance(access_url, six.string_types)
|
)
|
||||||
|
self.assertIsInstance(access_url, six.string_types)
|
||||||
|
with open(FILE_PATH, 'r') as f:
|
||||||
|
body = f.read()
|
||||||
|
r = requests.get(access_url)
|
||||||
|
self.assertEqual(r.text, body)
|
||||||
|
|
||||||
def test_list_objs(self):
|
def test_list_objs(self):
|
||||||
objs = self._api.list_objs(token=self._token)
|
objs = self._api.list_objs(token=self._token)
|
||||||
|
|||||||
89
transformers/tests/model_card_test.py
Normal file
89
transformers/tests/model_card_test.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019 HuggingFace Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from transformers.model_card import ModelCard
|
||||||
|
from .tokenization_tests_commons import TemporaryDirectory
|
||||||
|
|
||||||
|
class ModelCardTester(unittest.TestCase):
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.inputs_dict = {'model_details': {
|
||||||
|
'Organization': 'testing',
|
||||||
|
'Model date': 'today',
|
||||||
|
'Model version': 'v2.1, Developed by Test Corp in 2019.',
|
||||||
|
'Architecture': 'Convolutional Neural Network.',
|
||||||
|
},
|
||||||
|
'metrics': 'BLEU and ROUGE-1',
|
||||||
|
'evaluation_data':{
|
||||||
|
'Datasets':{
|
||||||
|
'BLEU': 'My-great-dataset-v1',
|
||||||
|
'ROUGE-1': 'My-short-dataset-v2.1',
|
||||||
|
},
|
||||||
|
'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf'
|
||||||
|
},
|
||||||
|
'training_data':{
|
||||||
|
'Dataset': 'English Wikipedia dump dated 2018-12-01',
|
||||||
|
'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf'
|
||||||
|
},
|
||||||
|
'quantitative_analyses': {
|
||||||
|
'BLEU': 55.1,
|
||||||
|
'ROUGE-1': 76,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
def test_model_card_common_properties(self):
|
||||||
|
model_card = ModelCard.from_dict(self.inputs_dict)
|
||||||
|
self.assertTrue(hasattr(model_card, 'model_details'))
|
||||||
|
self.assertTrue(hasattr(model_card, 'intended_use'))
|
||||||
|
self.assertTrue(hasattr(model_card, 'factors'))
|
||||||
|
self.assertTrue(hasattr(model_card, 'metrics'))
|
||||||
|
self.assertTrue(hasattr(model_card, 'evaluation_data'))
|
||||||
|
self.assertTrue(hasattr(model_card, 'training_data'))
|
||||||
|
self.assertTrue(hasattr(model_card, 'quantitative_analyses'))
|
||||||
|
self.assertTrue(hasattr(model_card, 'ethical_considerations'))
|
||||||
|
self.assertTrue(hasattr(model_card, 'caveats_and_recommendations'))
|
||||||
|
|
||||||
|
def test_model_card_to_json_string(self):
|
||||||
|
model_card = ModelCard.from_dict(self.inputs_dict)
|
||||||
|
obj = json.loads(model_card.to_json_string())
|
||||||
|
for key, value in self.inputs_dict.items():
|
||||||
|
self.assertEqual(obj[key], value)
|
||||||
|
|
||||||
|
def test_model_card_to_json_file(self):
|
||||||
|
model_card_first = ModelCard.from_dict(self.inputs_dict)
|
||||||
|
|
||||||
|
with TemporaryDirectory() as tmpdirname:
|
||||||
|
filename = os.path.join(tmpdirname, u"model_card.json")
|
||||||
|
model_card_first.to_json_file(filename)
|
||||||
|
model_card_second = ModelCard.from_json_file(filename)
|
||||||
|
|
||||||
|
self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
|
||||||
|
|
||||||
|
def test_model_card_from_and_save_pretrained(self):
    """Save a card with ``save_pretrained``, reload it with ``from_pretrained``, and compare as dicts."""
    original_card = ModelCard.from_dict(self.inputs_dict)

    with TemporaryDirectory() as tmpdirname:
        original_card.save_pretrained(tmpdirname)
        reloaded_card = ModelCard.from_pretrained(tmpdirname)

    self.assertEqual(reloaded_card.to_dict(), original_card.to_dict())
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -110,7 +110,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = AlbertConfig(
|
config = AlbertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ import logging
|
|||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
from .utils import require_torch, slow
|
from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import (AutoConfig, BertConfig,
|
from transformers import (AutoConfig, BertConfig,
|
||||||
@@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
|
|||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
self.assertIsInstance(model, BertForQuestionAnswering)
|
self.assertIsInstance(model, BertForQuestionAnswering)
|
||||||
|
|
||||||
|
def test_from_pretrained_identifier(self):
    """Load ``SMALL_MODEL_IDENTIFIER`` via ``AutoModelWithLMHead`` and check the class it resolves to."""
    logging.basicConfig(level=logging.INFO)
    lm_model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
    self.assertIsInstance(lm_model, BertForMaskedLM)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = BertConfig(
|
config = BertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ else:
|
|||||||
def _config_zero_init(config):
|
def _config_zero_init(config):
|
||||||
configs_no_init = copy.deepcopy(config)
|
configs_no_init = copy.deepcopy(config)
|
||||||
for key in configs_no_init.__dict__.keys():
|
for key in configs_no_init.__dict__.keys():
|
||||||
if '_range' in key or '_std' in key:
|
if '_range' in key or '_std' in key or 'initializer_factor' in key:
|
||||||
setattr(configs_no_init, key, 0.0)
|
setattr(configs_no_init, key, 0.0)
|
||||||
return configs_no_init
|
return configs_no_init
|
||||||
|
|
||||||
@@ -73,6 +73,7 @@ class CommonTestCases:
|
|||||||
test_pruning = True
|
test_pruning = True
|
||||||
test_resize_embeddings = True
|
test_resize_embeddings = True
|
||||||
test_head_masking = True
|
test_head_masking = True
|
||||||
|
is_encoder_decoder = False
|
||||||
|
|
||||||
def test_save_load(self):
|
def test_save_load(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
@@ -83,6 +84,8 @@ class CommonTestCases:
|
|||||||
model.eval()
|
model.eval()
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
outputs = model(**inputs_dict)
|
outputs = model(**inputs_dict)
|
||||||
|
out_2 = outputs[0].numpy()
|
||||||
|
out_2[np.isnan(out_2)] = 0
|
||||||
|
|
||||||
with TemporaryDirectory() as tmpdirname:
|
with TemporaryDirectory() as tmpdirname:
|
||||||
model.save_pretrained(tmpdirname)
|
model.save_pretrained(tmpdirname)
|
||||||
@@ -93,9 +96,7 @@ class CommonTestCases:
|
|||||||
|
|
||||||
# Make sure we don't have nans
|
# Make sure we don't have nans
|
||||||
out_1 = after_outputs[0].cpu().numpy()
|
out_1 = after_outputs[0].cpu().numpy()
|
||||||
out_2 = outputs[0].cpu().numpy()
|
out_1[np.isnan(out_1)] = 0
|
||||||
out_1 = out_1[~np.isnan(out_1)]
|
|
||||||
out_2 = out_2[~np.isnan(out_2)]
|
|
||||||
max_diff = np.amax(np.abs(out_1 - out_2))
|
max_diff = np.amax(np.abs(out_1 - out_2))
|
||||||
self.assertLessEqual(max_diff, 1e-5)
|
self.assertLessEqual(max_diff, 1e-5)
|
||||||
|
|
||||||
@@ -117,20 +118,32 @@ class CommonTestCases:
|
|||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
|
with torch.no_grad():
|
||||||
self.assertEqual(first.ne(second).sum().item(), 0)
|
first = model(**inputs_dict)[0]
|
||||||
|
second = model(**inputs_dict)[0]
|
||||||
|
out_1 = first.cpu().numpy()
|
||||||
|
out_2 = second.cpu().numpy()
|
||||||
|
out_1 = out_1[~np.isnan(out_1)]
|
||||||
|
out_2 = out_2[~np.isnan(out_2)]
|
||||||
|
max_diff = np.amax(np.abs(out_1 - out_2))
|
||||||
|
self.assertLessEqual(max_diff, 1e-5)
|
||||||
|
|
||||||
def test_attention_outputs(self):
|
def test_attention_outputs(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
|
||||||
|
encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
|
||||||
|
decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
|
||||||
|
encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = False
|
config.output_hidden_states = False
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
self.assertEqual(model.config.output_attentions, True)
|
self.assertEqual(model.config.output_attentions, True)
|
||||||
self.assertEqual(model.config.output_hidden_states, False)
|
self.assertEqual(model.config.output_hidden_states, False)
|
||||||
@@ -138,28 +151,42 @@ class CommonTestCases:
|
|||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
list(attentions[0].shape[-3:]),
|
list(attentions[0].shape[-3:]),
|
||||||
[self.model_tester.num_attention_heads,
|
[self.model_tester.num_attention_heads,
|
||||||
self.model_tester.seq_length,
|
encoder_seq_length ,
|
||||||
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
|
encoder_key_length])
|
||||||
out_len = len(outputs)
|
out_len = len(outputs)
|
||||||
|
|
||||||
|
if self.is_encoder_decoder:
|
||||||
|
self.assertEqual(out_len % 2, 0)
|
||||||
|
decoder_attentions = outputs[(out_len // 2)-1]
|
||||||
|
self.assertEqual(model.config.output_attentions, True)
|
||||||
|
self.assertEqual(model.config.output_hidden_states, False)
|
||||||
|
self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
|
||||||
|
self.assertListEqual(
|
||||||
|
list(decoder_attentions[0].shape[-3:]),
|
||||||
|
[self.model_tester.num_attention_heads,
|
||||||
|
decoder_seq_length,
|
||||||
|
decoder_key_length
|
||||||
|
])
|
||||||
|
|
||||||
# Check attention is always last and order is fine
|
# Check attention is always last and order is fine
|
||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = True
|
config.output_hidden_states = True
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
self.assertEqual(out_len+1, len(outputs))
|
outputs = model(**inputs_dict)
|
||||||
|
self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
|
||||||
self.assertEqual(model.config.output_attentions, True)
|
self.assertEqual(model.config.output_attentions, True)
|
||||||
self.assertEqual(model.config.output_hidden_states, True)
|
self.assertEqual(model.config.output_hidden_states, True)
|
||||||
|
|
||||||
attentions = outputs[-1]
|
self_attentions = outputs[-1]
|
||||||
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
|
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
list(attentions[0].shape[-3:]),
|
list(self_attentions[0].shape[-3:]),
|
||||||
[self.model_tester.num_attention_heads,
|
[self.model_tester.num_attention_heads,
|
||||||
self.model_tester.seq_length,
|
encoder_seq_length,
|
||||||
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
|
encoder_key_length])
|
||||||
|
|
||||||
def test_torchscript(self):
|
def test_torchscript(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
@@ -223,7 +250,6 @@ class CommonTestCases:
|
|||||||
|
|
||||||
self.assertTrue(models_equal)
|
self.assertTrue(models_equal)
|
||||||
|
|
||||||
|
|
||||||
def test_headmasking(self):
|
def test_headmasking(self):
|
||||||
if not self.test_head_masking:
|
if not self.test_head_masking:
|
||||||
return
|
return
|
||||||
@@ -278,7 +304,6 @@ class CommonTestCases:
|
|||||||
self.assertNotEqual(
|
self.assertNotEqual(
|
||||||
attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
|
attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
|
||||||
|
|
||||||
|
|
||||||
def test_head_pruning(self):
|
def test_head_pruning(self):
|
||||||
if not self.test_pruning:
|
if not self.test_pruning:
|
||||||
return
|
return
|
||||||
@@ -297,7 +322,8 @@ class CommonTestCases:
|
|||||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||||
-1: [0]}
|
-1: [0]}
|
||||||
model.prune_heads(heads_to_prune)
|
model.prune_heads(heads_to_prune)
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
|
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
|
|
||||||
@@ -333,7 +359,8 @@ class CommonTestCases:
|
|||||||
model = model_class.from_pretrained(directory)
|
model = model_class.from_pretrained(directory)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
self.assertEqual(attentions[0].shape[-3], 1)
|
self.assertEqual(attentions[0].shape[-3], 1)
|
||||||
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
||||||
@@ -362,7 +389,8 @@ class CommonTestCases:
|
|||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
|
|
||||||
self.assertEqual(attentions[0].shape[-3], 1)
|
self.assertEqual(attentions[0].shape[-3], 1)
|
||||||
@@ -389,7 +417,8 @@ class CommonTestCases:
|
|||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
|
|
||||||
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
|
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||||
@@ -406,7 +435,8 @@ class CommonTestCases:
|
|||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
shutil.rmtree(directory)
|
shutil.rmtree(directory)
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
|
|
||||||
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
|
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||||
@@ -417,7 +447,8 @@ class CommonTestCases:
|
|||||||
heads_to_prune = {0: [0], 2: [1, 2]}
|
heads_to_prune = {0: [0], 2: [1, 2]}
|
||||||
model.prune_heads(heads_to_prune)
|
model.prune_heads(heads_to_prune)
|
||||||
|
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
attentions = outputs[-1]
|
attentions = outputs[-1]
|
||||||
|
|
||||||
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
|
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
|
||||||
@@ -427,7 +458,6 @@ class CommonTestCases:
|
|||||||
|
|
||||||
self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
|
self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
|
||||||
|
|
||||||
|
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
@@ -437,14 +467,16 @@ class CommonTestCases:
|
|||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(**inputs_dict)
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
hidden_states = outputs[-1]
|
hidden_states = outputs[-1]
|
||||||
self.assertEqual(model.config.output_attentions, False)
|
self.assertEqual(model.config.output_attentions, False)
|
||||||
self.assertEqual(model.config.output_hidden_states, True)
|
self.assertEqual(model.config.output_hidden_states, True)
|
||||||
self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
|
self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
list(hidden_states[0].shape[-2:]),
|
list(hidden_states[0].shape[-2:]),
|
||||||
[self.model_tester.seq_length, self.model_tester.hidden_size])
|
[self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
|
||||||
|
self.model_tester.hidden_size])
|
||||||
|
|
||||||
def test_resize_tokens_embeddings(self):
|
def test_resize_tokens_embeddings(self):
|
||||||
original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
@@ -550,8 +582,14 @@ class CommonTestCases:
|
|||||||
|
|
||||||
def test_inputs_embeds(self):
|
def test_inputs_embeds(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
input_ids = inputs_dict["input_ids"]
|
if not self.is_encoder_decoder:
|
||||||
del inputs_dict["input_ids"]
|
input_ids = inputs_dict["input_ids"]
|
||||||
|
del inputs_dict["input_ids"]
|
||||||
|
else:
|
||||||
|
encoder_input_ids = inputs_dict["encoder_input_ids"]
|
||||||
|
decoder_input_ids = inputs_dict["decoder_input_ids"]
|
||||||
|
del inputs_dict["encoder_input_ids"]
|
||||||
|
del inputs_dict["decoder_input_ids"]
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
@@ -559,9 +597,14 @@ class CommonTestCases:
|
|||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
wte = model.get_input_embeddings()
|
wte = model.get_input_embeddings()
|
||||||
inputs_dict["inputs_embeds"] = wte(input_ids)
|
if not self.is_encoder_decoder:
|
||||||
outputs = model(**inputs_dict)
|
inputs_dict["inputs_embeds"] = wte(input_ids)
|
||||||
|
else:
|
||||||
|
inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
|
||||||
|
inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
|
|
||||||
class GPTModelTester(CommonModelTester):
|
class GPTModelTester(CommonModelTester):
|
||||||
|
|
||||||
@@ -633,7 +676,7 @@ class CommonTestCases:
|
|||||||
mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
|
mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
|
||||||
|
|
||||||
config = self.config_class(
|
config = self.config_class(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_positions=self.n_positions,
|
n_positions=self.n_positions,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
@@ -649,9 +692,10 @@ class CommonTestCases:
|
|||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
outputs = model(input_ids, position_ids, token_type_ids)
|
with torch.no_grad():
|
||||||
outputs = model(input_ids, position_ids)
|
outputs = model(input_ids, position_ids, token_type_ids)
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids, position_ids)
|
||||||
|
outputs = model(input_ids)
|
||||||
|
|
||||||
hidden_state = outputs[0]
|
hidden_state = outputs[0]
|
||||||
self.parent.assertListEqual(
|
self.parent.assertListEqual(
|
||||||
@@ -664,7 +708,8 @@ class CommonTestCases:
|
|||||||
model = self.lm_head_model_class(config)
|
model = self.lm_head_model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
with torch.no_grad():
|
||||||
|
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
||||||
loss, lm_logits = outputs[:2]
|
loss, lm_logits = outputs[:2]
|
||||||
|
|
||||||
total_voc = self.vocab_size
|
total_voc = self.vocab_size
|
||||||
@@ -681,7 +726,8 @@ class CommonTestCases:
|
|||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(input_ids)
|
with torch.no_grad():
|
||||||
|
outputs = model(input_ids)
|
||||||
presents = outputs[-1]
|
presents = outputs[-1]
|
||||||
self.parent.assertEqual(self.num_hidden_layers, len(presents))
|
self.parent.assertEqual(self.num_hidden_layers, len(presents))
|
||||||
self.parent.assertListEqual(
|
self.parent.assertListEqual(
|
||||||
@@ -694,7 +740,8 @@ class CommonTestCases:
|
|||||||
model = self.double_head_model_class(config)
|
model = self.double_head_model_class(config)
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
|
with torch.no_grad():
|
||||||
|
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
|
||||||
token_type_ids=token_type_ids, position_ids=position_ids)
|
token_type_ids=token_type_ids, position_ids=position_ids)
|
||||||
lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
|
lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
|
||||||
loss = [lm_loss, mc_loss]
|
loss = [lm_loss, mc_loss]
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = CTRLConfig(
|
config = CTRLConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = DistilBertConfig(
|
config = DistilBertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
dim=self.hidden_size,
|
dim=self.hidden_size,
|
||||||
n_layers=self.num_hidden_layers,
|
n_layers=self.num_hidden_layers,
|
||||||
n_heads=self.num_attention_heads,
|
n_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = GPT2Config(
|
config = GPT2Config(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = OpenAIGPTConfig(
|
config = OpenAIGPTConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = RobertaConfig(
|
config = RobertaConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
185
transformers/tests/modeling_t5_test.py
Normal file
185
transformers/tests/modeling_t5_test.py
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from transformers import is_torch_available
|
||||||
|
|
||||||
|
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
from transformers import (T5Config, T5Model, T5WithLMHeadModel)
|
||||||
|
from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
class T5ModelTest(CommonTestCases.CommonModelTester):
    """Tests for ``T5Model`` and ``T5WithLMHeadModel``.

    Combines the shared checks inherited from ``CommonTestCases.CommonModelTester``
    with T5-specific shape checks implemented by the nested ``T5ModelTester``.
    """

    all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
    # Class-level switches for the shared test suite.
    test_pruning = False
    test_torchscript = False
    test_resize_embeddings = False
    is_encoder_decoder = True

    class T5ModelTester(object):
        """Builds a small ``T5Config`` plus random inputs and checks model output shapes."""

        def __init__(self,
                     parent,
                     batch_size=13,
                     encoder_seq_length=7,
                     decoder_seq_length=9,
                     is_training=True,
                     use_attention_mask=True,
                     use_labels=True,
                     vocab_size=99,
                     n_positions=14,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     d_ff=37,
                     relative_attention_num_buckets=8,
                     dropout_rate=0.1,
                     initializer_factor=0.002,
                     scope=None,
                     ):
            """Store the test dimensions.

            ``parent`` is the ``unittest.TestCase`` whose assertion methods are used.
            ``hidden_size``, ``num_hidden_layers`` and ``num_attention_heads`` are passed
            to ``T5Config`` as ``d_model``, ``num_layers`` and ``num_heads`` respectively.
            """
            self.parent = parent
            self.batch_size = batch_size
            self.encoder_seq_length = encoder_seq_length
            self.decoder_seq_length = decoder_seq_length
            self.is_training = is_training
            self.use_attention_mask = use_attention_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.scope = scope

        def prepare_config_and_inputs(self):
            """Return ``(config, encoder_input_ids, decoder_input_ids, encoder_attention_mask,
            decoder_attention_mask, decoder_lm_labels)``.

            Masks are ``None`` when ``use_attention_mask`` is false; labels are ``None``
            when ``use_labels`` is false.
            """
            encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

            encoder_attention_mask = None
            decoder_attention_mask = None
            if self.use_attention_mask:
                # vocab_size=2 -- presumably yields 0/1 entries; confirm against ids_tensor.
                encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)

            decoder_lm_labels = None
            if self.use_labels:
                decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

            config = T5Config(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor)

            return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels)

        def check_loss_output(self, result):
            """Assert that ``result["loss"]`` has an empty size, i.e. is a scalar."""
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
            """Run ``T5Model`` with and without attention masks and check both output shapes."""
            model = T5Model(config=config)
            # Move the model to the shared test device, as the other model testers do,
            # so it matches the device of the inputs it is fed.
            model.to(torch_device)
            model.eval()
            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
                                                   decoder_input_ids=decoder_input_ids,
                                                   encoder_attention_mask=encoder_attention_mask,
                                                   decoder_attention_mask=decoder_attention_mask)
            # Second call omits the masks; its outputs are the ones checked below.
            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
                                                   decoder_input_ids=decoder_input_ids)

            result = {
                "encoder_output": encoder_output,
                "decoder_output": decoder_output,
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].size()),
                [self.batch_size, self.encoder_seq_length, self.hidden_size])
            self.parent.assertListEqual(
                list(result["decoder_output"].size()),
                [self.batch_size, self.decoder_seq_length, self.hidden_size])

        def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
            """Run ``T5WithLMHeadModel`` with labels and check the loss and prediction score shapes."""
            model = T5WithLMHeadModel(config=config)
            # Same device handling as in create_and_check_t5_model.
            model.to(torch_device)
            model.eval()
            outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
                            decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels)
            loss, prediction_scores = outputs[0], outputs[1]
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.decoder_seq_length, self.vocab_size])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            """Return ``(config, inputs_dict)`` for the shared tests; labels are left out of the dict."""
            config_and_inputs = self.prepare_config_and_inputs()
            (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask,
             decoder_attention_mask, decoder_lm_labels) = config_and_inputs
            inputs_dict = {'encoder_input_ids': encoder_input_ids,
                           'decoder_input_ids': decoder_input_ids,
                           'decoder_attention_mask': decoder_attention_mask,
                           'encoder_attention_mask': encoder_attention_mask}
            return config, inputs_dict

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = T5ModelTest.T5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):
        """Run the shared configuration checks on ``T5Config``."""
        self.config_tester.run_common_tests()

    def test_t5_model(self):
        """Check ``T5Model`` output shapes on freshly generated inputs."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_model(*config_and_inputs)

    def test_with_lm_head(self):
        """Check ``T5WithLMHeadModel`` loss and prediction score shapes on freshly generated inputs."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        """Load the first entry of ``T5_PRETRAINED_MODEL_ARCHIVE_MAP`` and remove the cache directory."""
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = T5Model.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -118,7 +118,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = AlbertConfig(
|
config = AlbertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ import logging
|
|||||||
|
|
||||||
from transformers import is_tf_available
|
from transformers import is_tf_available
|
||||||
|
|
||||||
from .utils import require_tf, slow
|
from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
|
||||||
|
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
from transformers import (AutoConfig, BertConfig,
|
from transformers import (AutoConfig, BertConfig,
|
||||||
@@ -93,6 +93,11 @@ class TFAutoModelTest(unittest.TestCase):
|
|||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
self.assertIsInstance(model, TFBertForQuestionAnswering)
|
self.assertIsInstance(model, TFBertForQuestionAnswering)
|
||||||
|
|
||||||
|
def test_from_pretrained_identifier(self):
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True)
|
||||||
|
self.assertIsInstance(model, TFBertForMaskedLM)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = BertConfig(
|
config = BertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -69,6 +69,7 @@ class TFCommonTestCases:
|
|||||||
test_torchscript = True
|
test_torchscript = True
|
||||||
test_pruning = True
|
test_pruning = True
|
||||||
test_resize_embeddings = True
|
test_resize_embeddings = True
|
||||||
|
is_encoder_decoder = False
|
||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
pass
|
pass
|
||||||
@@ -129,8 +130,12 @@ class TFCommonTestCases:
|
|||||||
for name, key in inputs_dict.items())
|
for name, key in inputs_dict.items())
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
pto = pt_model(**pt_inputs_dict)
|
pto = pt_model(**pt_inputs_dict)
|
||||||
tfo = tf_model(inputs_dict)
|
tfo = tf_model(inputs_dict, training=False)
|
||||||
max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
|
tf_hidden_states = tfo[0].numpy()
|
||||||
|
pt_hidden_states = pto[0].numpy()
|
||||||
|
tf_hidden_states[np.isnan(tf_hidden_states)] = 0
|
||||||
|
pt_hidden_states[np.isnan(pt_hidden_states)] = 0
|
||||||
|
max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
|
||||||
self.assertLessEqual(max_diff, 2e-2)
|
self.assertLessEqual(max_diff, 2e-2)
|
||||||
|
|
||||||
# Check we can load pt model in tf and vice-versa with checkpoint => model functions
|
# Check we can load pt model in tf and vice-versa with checkpoint => model functions
|
||||||
@@ -150,13 +155,21 @@ class TFCommonTestCases:
|
|||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
pto = pt_model(**pt_inputs_dict)
|
pto = pt_model(**pt_inputs_dict)
|
||||||
tfo = tf_model(inputs_dict)
|
tfo = tf_model(inputs_dict)
|
||||||
max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
|
tfo = tfo[0].numpy()
|
||||||
|
pto = pto[0].numpy()
|
||||||
|
tfo[np.isnan(tfo)] = 0
|
||||||
|
pto[np.isnan(pto)] = 0
|
||||||
|
max_diff = np.amax(np.abs(tfo - pto))
|
||||||
self.assertLessEqual(max_diff, 2e-2)
|
self.assertLessEqual(max_diff, 2e-2)
|
||||||
|
|
||||||
def test_compile_tf_model(self):
|
def test_compile_tf_model(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
|
if self.is_encoder_decoder:
|
||||||
|
input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
|
||||||
|
'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
|
||||||
|
else:
|
||||||
|
input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
|
||||||
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
|
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
|
||||||
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||||
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
|
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
|
||||||
@@ -189,7 +202,7 @@ class TFCommonTestCases:
|
|||||||
outputs_dict = model(inputs_dict)
|
outputs_dict = model(inputs_dict)
|
||||||
|
|
||||||
inputs_keywords = copy.deepcopy(inputs_dict)
|
inputs_keywords = copy.deepcopy(inputs_dict)
|
||||||
input_ids = inputs_keywords.pop('input_ids')
|
input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
|
||||||
outputs_keywords = model(input_ids, **inputs_keywords)
|
outputs_keywords = model(input_ids, **inputs_keywords)
|
||||||
|
|
||||||
output_dict = outputs_dict[0].numpy()
|
output_dict = outputs_dict[0].numpy()
|
||||||
@@ -200,6 +213,11 @@ class TFCommonTestCases:
|
|||||||
def test_attention_outputs(self):
|
def test_attention_outputs(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
|
||||||
|
encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
|
||||||
|
decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
|
||||||
|
encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = False
|
config.output_hidden_states = False
|
||||||
@@ -212,16 +230,28 @@ class TFCommonTestCases:
|
|||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
list(attentions[0].shape[-3:]),
|
list(attentions[0].shape[-3:]),
|
||||||
[self.model_tester.num_attention_heads,
|
[self.model_tester.num_attention_heads,
|
||||||
self.model_tester.seq_length,
|
encoder_seq_length,
|
||||||
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
|
encoder_key_length])
|
||||||
out_len = len(outputs)
|
out_len = len(outputs)
|
||||||
|
|
||||||
|
if self.is_encoder_decoder:
|
||||||
|
self.assertEqual(out_len % 2, 0)
|
||||||
|
decoder_attentions = outputs[(out_len // 2)-1]
|
||||||
|
self.assertEqual(model.config.output_attentions, True)
|
||||||
|
self.assertEqual(model.config.output_hidden_states, False)
|
||||||
|
self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
|
||||||
|
self.assertListEqual(
|
||||||
|
list(decoder_attentions[0].shape[-3:]),
|
||||||
|
[self.model_tester.num_attention_heads,
|
||||||
|
decoder_seq_length,
|
||||||
|
decoder_key_length])
|
||||||
|
|
||||||
# Check attention is always last and order is fine
|
# Check attention is always last and order is fine
|
||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = True
|
config.output_hidden_states = True
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
outputs = model(inputs_dict)
|
outputs = model(inputs_dict)
|
||||||
self.assertEqual(out_len+1, len(outputs))
|
self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
|
||||||
self.assertEqual(model.config.output_attentions, True)
|
self.assertEqual(model.config.output_attentions, True)
|
||||||
self.assertEqual(model.config.output_hidden_states, True)
|
self.assertEqual(model.config.output_hidden_states, True)
|
||||||
|
|
||||||
@@ -230,8 +260,8 @@ class TFCommonTestCases:
|
|||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
list(attentions[0].shape[-3:]),
|
list(attentions[0].shape[-3:]),
|
||||||
[self.model_tester.num_attention_heads,
|
[self.model_tester.num_attention_heads,
|
||||||
self.model_tester.seq_length,
|
encoder_seq_length,
|
||||||
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
|
encoder_key_length])
|
||||||
|
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
@@ -264,35 +294,53 @@ class TFCommonTestCases:
|
|||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
|
first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
|
||||||
self.assertTrue(tf.math.equal(first, second).numpy().all())
|
out_1 = first.numpy()
|
||||||
|
out_2 = second.numpy()
|
||||||
|
out_1 = out_1[~np.isnan(out_1)]
|
||||||
|
out_2 = out_2[~np.isnan(out_2)]
|
||||||
|
max_diff = np.amax(np.abs(out_1 - out_2))
|
||||||
|
self.assertLessEqual(max_diff, 1e-5)
|
||||||
|
|
||||||
|
def _get_embeds(self, wte, input_ids):
|
||||||
|
# ^^ In our TF models, the input_embeddings can take slightly different forms,
|
||||||
|
# so we try a few of them.
|
||||||
|
# We used to fall back to just synthetically creating a dummy tensor of ones:
|
||||||
|
try:
|
||||||
|
x = wte(input_ids, mode="embedding")
|
||||||
|
except:
|
||||||
|
try:
|
||||||
|
x = wte([input_ids], mode="embedding")
|
||||||
|
except:
|
||||||
|
try:
|
||||||
|
x = wte([input_ids, None, None, None], mode="embedding")
|
||||||
|
except:
|
||||||
|
if hasattr(self.model_tester, "embedding_size"):
|
||||||
|
x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
|
||||||
|
else:
|
||||||
|
x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
|
||||||
|
return x
|
||||||
|
|
||||||
def test_inputs_embeds(self):
|
def test_inputs_embeds(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
input_ids = inputs_dict["input_ids"]
|
if not self.is_encoder_decoder:
|
||||||
del inputs_dict["input_ids"]
|
input_ids = inputs_dict["input_ids"]
|
||||||
|
del inputs_dict["input_ids"]
|
||||||
|
else:
|
||||||
|
encoder_input_ids = inputs_dict["encoder_input_ids"]
|
||||||
|
decoder_input_ids = inputs_dict["decoder_input_ids"]
|
||||||
|
del inputs_dict["encoder_input_ids"]
|
||||||
|
del inputs_dict["decoder_input_ids"]
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
model = model_class(config)
|
model = model_class(config)
|
||||||
|
|
||||||
wte = model.get_input_embeddings()
|
wte = model.get_input_embeddings()
|
||||||
try:
|
if not self.is_encoder_decoder:
|
||||||
x = wte(input_ids, mode="embedding")
|
inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
|
||||||
except:
|
else:
|
||||||
try:
|
inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
|
||||||
x = wte([input_ids], mode="embedding")
|
inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
|
||||||
except:
|
|
||||||
try:
|
|
||||||
x = wte([input_ids, None, None, None], mode="embedding")
|
|
||||||
except:
|
|
||||||
if hasattr(self.model_tester, "embedding_size"):
|
|
||||||
x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
|
|
||||||
else:
|
|
||||||
x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
|
|
||||||
# ^^ In our TF models, the input_embeddings can take slightly different forms,
|
|
||||||
# so we try a few of them.
|
|
||||||
# We used to fall back to just synthetically creating a dummy tensor of ones:
|
|
||||||
#
|
|
||||||
inputs_dict["inputs_embeds"] = x
|
|
||||||
outputs = model(inputs_dict)
|
outputs = model(inputs_dict)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = CTRLConfig(
|
config = CTRLConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = DistilBertConfig(
|
config = DistilBertConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
dim=self.hidden_size,
|
dim=self.hidden_size,
|
||||||
n_layers=self.num_hidden_layers,
|
n_layers=self.num_hidden_layers,
|
||||||
n_heads=self.num_attention_heads,
|
n_heads=self.num_attention_heads,
|
||||||
|
|||||||
@@ -115,7 +115,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = GPT2Config(
|
config = GPT2Config(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = OpenAIGPTConfig(
|
config = OpenAIGPTConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
config = RobertaConfig(
|
config = RobertaConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
num_hidden_layers=self.num_hidden_layers,
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
num_attention_heads=self.num_attention_heads,
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
|||||||
172
transformers/tests/modeling_tf_t5_test.py
Normal file
172
transformers/tests/modeling_tf_t5_test.py
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
|
from transformers import T5Config, is_tf_available
|
||||||
|
|
||||||
|
if is_tf_available():
|
||||||
|
import tensorflow as tf
|
||||||
|
from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
|
||||||
|
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
|
class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
|
is_encoder_decoder = True
|
||||||
|
all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()
|
||||||
|
|
||||||
|
class TFT5ModelTester(object):
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
parent,
|
||||||
|
batch_size=13,
|
||||||
|
seq_length=7,
|
||||||
|
is_training=True,
|
||||||
|
use_input_mask=True,
|
||||||
|
use_labels=True,
|
||||||
|
vocab_size=99,
|
||||||
|
n_positions=14,
|
||||||
|
hidden_size=32,
|
||||||
|
num_hidden_layers=5,
|
||||||
|
num_attention_heads=4,
|
||||||
|
d_ff=37,
|
||||||
|
relative_attention_num_buckets=8,
|
||||||
|
dropout_rate=0.1,
|
||||||
|
initializer_factor=0.002,
|
||||||
|
scope=None,
|
||||||
|
):
|
||||||
|
self.parent = parent
|
||||||
|
self.batch_size = batch_size
|
||||||
|
self.seq_length = seq_length
|
||||||
|
self.is_training = is_training
|
||||||
|
self.use_input_mask = use_input_mask
|
||||||
|
self.use_labels = use_labels
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.n_positions = n_positions
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.d_ff = d_ff
|
||||||
|
self.relative_attention_num_buckets = relative_attention_num_buckets
|
||||||
|
self.dropout_rate = dropout_rate
|
||||||
|
self.initializer_factor = initializer_factor
|
||||||
|
self.scope = scope
|
||||||
|
|
||||||
|
def prepare_config_and_inputs(self):
|
||||||
|
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
|
input_mask = None
|
||||||
|
if self.use_input_mask:
|
||||||
|
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
||||||
|
|
||||||
|
token_labels = None
|
||||||
|
if self.use_labels:
|
||||||
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
|
config = T5Config(
|
||||||
|
vocab_size=self.vocab_size,
|
||||||
|
n_positions=self.n_positions,
|
||||||
|
d_model=self.hidden_size,
|
||||||
|
d_ff=self.d_ff,
|
||||||
|
d_kv=self.hidden_size // self.num_attention_heads,
|
||||||
|
num_layers=self.num_hidden_layers,
|
||||||
|
num_heads=self.num_attention_heads,
|
||||||
|
relative_attention_num_buckets=self.relative_attention_num_buckets,
|
||||||
|
dropout_rate=self.dropout_rate,
|
||||||
|
initializer_factor=self.initializer_factor)
|
||||||
|
|
||||||
|
return (config, input_ids, input_mask, token_labels)
|
||||||
|
|
||||||
|
def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
|
||||||
|
model = TFT5Model(config=config)
|
||||||
|
inputs = {'encoder_input_ids': input_ids,
|
||||||
|
'decoder_input_ids': input_ids,
|
||||||
|
'decoder_attention_mask': input_mask}
|
||||||
|
encoder_output, decoder_output = model(inputs)
|
||||||
|
|
||||||
|
encoder_output, decoder_output = model(input_ids,
|
||||||
|
decoder_attention_mask=input_mask,
|
||||||
|
encoder_input_ids=input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"encoder_output": encoder_output.numpy(),
|
||||||
|
"decoder_output": decoder_output.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["encoder_output"].shape),
|
||||||
|
[self.batch_size, self.seq_length, self.hidden_size])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["decoder_output"].shape),
|
||||||
|
[self.batch_size, self.seq_length, self.hidden_size])
|
||||||
|
|
||||||
|
|
||||||
|
def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
|
||||||
|
model = TFT5WithLMHeadModel(config=config)
|
||||||
|
inputs = {'encoder_input_ids': input_ids,
|
||||||
|
'decoder_input_ids': input_ids,
|
||||||
|
'decoder_attention_mask': input_mask}
|
||||||
|
prediction_scores, decoder_output = model(inputs)
|
||||||
|
result = {
|
||||||
|
"prediction_scores": prediction_scores.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["prediction_scores"].shape),
|
||||||
|
[self.batch_size, self.seq_length, self.vocab_size])
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
(config, input_ids, input_mask, token_labels) = config_and_inputs
|
||||||
|
inputs_dict = {'encoder_input_ids': input_ids,
|
||||||
|
'decoder_input_ids': input_ids,
|
||||||
|
'decoder_attention_mask': input_mask}
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
|
||||||
|
self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
|
||||||
|
|
||||||
|
def test_config(self):
|
||||||
|
self.config_tester.run_common_tests()
|
||||||
|
|
||||||
|
def test_t5_model(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_t5_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_with_lm_head(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_model_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/transformers_test/"
|
||||||
|
for model_name in ['t5-small']:
|
||||||
|
model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -67,7 +67,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.seq_length = seq_length
|
self.seq_length = seq_length
|
||||||
self.mem_len = mem_len
|
self.mem_len = mem_len
|
||||||
self.key_len = seq_length + mem_len
|
self.key_length = seq_length + mem_len
|
||||||
self.clamp_len = clamp_len
|
self.clamp_len = clamp_len
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.use_labels = use_labels
|
self.use_labels = use_labels
|
||||||
@@ -92,7 +92,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
config = TransfoXLConfig(
|
config = TransfoXLConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
mem_len=self.mem_len,
|
mem_len=self.mem_len,
|
||||||
clamp_len=self.clamp_len,
|
clamp_len=self.clamp_len,
|
||||||
cutoffs=self.cutoffs,
|
cutoffs=self.cutoffs,
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
||||||
|
|
||||||
config = XLMConfig(
|
config = XLMConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_special=self.n_special,
|
n_special=self.n_special,
|
||||||
emb_dim=self.hidden_size,
|
emb_dim=self.hidden_size,
|
||||||
n_layers=self.num_hidden_layers,
|
n_layers=self.num_hidden_layers,
|
||||||
|
|||||||
@@ -64,7 +64,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
num_attention_heads=4,
|
num_attention_heads=4,
|
||||||
d_inner=128,
|
d_inner=128,
|
||||||
num_hidden_layers=5,
|
num_hidden_layers=5,
|
||||||
max_position_embeddings=10,
|
|
||||||
type_sequence_label_size=2,
|
type_sequence_label_size=2,
|
||||||
untie_r=True,
|
untie_r=True,
|
||||||
bi_data=False,
|
bi_data=False,
|
||||||
@@ -88,7 +87,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
self.d_inner = d_inner
|
self.d_inner = d_inner
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.bi_data = bi_data
|
self.bi_data = bi_data
|
||||||
self.untie_r = untie_r
|
self.untie_r = untie_r
|
||||||
self.same_length = same_length
|
self.same_length = same_length
|
||||||
@@ -122,13 +120,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
|
||||||
|
|
||||||
config = XLNetConfig(
|
config = XLNetConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
d_model=self.hidden_size,
|
d_model=self.hidden_size,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
d_inner=self.d_inner,
|
d_inner=self.d_inner,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
untie_r=self.untie_r,
|
untie_r=self.untie_r,
|
||||||
max_position_embeddings=self.max_position_embeddings,
|
|
||||||
mem_len=self.mem_len,
|
mem_len=self.mem_len,
|
||||||
clamp_len=self.clamp_len,
|
clamp_len=self.clamp_len,
|
||||||
same_length=self.same_length,
|
same_length=self.same_length,
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.seq_length = seq_length
|
self.seq_length = seq_length
|
||||||
self.mem_len = mem_len
|
self.mem_len = mem_len
|
||||||
self.key_len = seq_length + mem_len
|
self.key_length = seq_length + mem_len
|
||||||
self.clamp_len = clamp_len
|
self.clamp_len = clamp_len
|
||||||
self.is_training = is_training
|
self.is_training = is_training
|
||||||
self.use_labels = use_labels
|
self.use_labels = use_labels
|
||||||
@@ -91,7 +91,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
|||||||
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
config = TransfoXLConfig(
|
config = TransfoXLConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
mem_len=self.mem_len,
|
mem_len=self.mem_len,
|
||||||
clamp_len=self.clamp_len,
|
clamp_len=self.clamp_len,
|
||||||
cutoffs=self.cutoffs,
|
cutoffs=self.cutoffs,
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
|||||||
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
|
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
|
||||||
|
|
||||||
config = XLMConfig(
|
config = XLMConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
n_special=self.n_special,
|
n_special=self.n_special,
|
||||||
emb_dim=self.hidden_size,
|
emb_dim=self.hidden_size,
|
||||||
n_layers=self.num_hidden_layers,
|
n_layers=self.num_hidden_layers,
|
||||||
|
|||||||
@@ -60,7 +60,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
|||||||
num_attention_heads=4,
|
num_attention_heads=4,
|
||||||
d_inner=128,
|
d_inner=128,
|
||||||
num_hidden_layers=5,
|
num_hidden_layers=5,
|
||||||
max_position_embeddings=10,
|
|
||||||
type_sequence_label_size=2,
|
type_sequence_label_size=2,
|
||||||
untie_r=True,
|
untie_r=True,
|
||||||
bi_data=False,
|
bi_data=False,
|
||||||
@@ -84,7 +83,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
|||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
self.d_inner = d_inner
|
self.d_inner = d_inner
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.bi_data = bi_data
|
self.bi_data = bi_data
|
||||||
self.untie_r = untie_r
|
self.untie_r = untie_r
|
||||||
self.same_length = same_length
|
self.same_length = same_length
|
||||||
@@ -116,13 +114,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
|||||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
config = XLNetConfig(
|
config = XLNetConfig(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
d_model=self.hidden_size,
|
d_model=self.hidden_size,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
d_inner=self.d_inner,
|
d_inner=self.d_inner,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
untie_r=self.untie_r,
|
untie_r=self.untie_r,
|
||||||
max_position_embeddings=self.max_position_embeddings,
|
|
||||||
mem_len=self.mem_len,
|
mem_len=self.mem_len,
|
||||||
clamp_len=self.clamp_len,
|
clamp_len=self.clamp_len,
|
||||||
same_length=self.same_length,
|
same_length=self.same_length,
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ import logging
|
|||||||
from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
|
from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
|
||||||
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
from .utils import slow
|
from .utils import slow, SMALL_MODEL_IDENTIFIER
|
||||||
|
|
||||||
|
|
||||||
class AutoTokenizerTest(unittest.TestCase):
|
class AutoTokenizerTest(unittest.TestCase):
|
||||||
@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
|
|||||||
self.assertIsInstance(tokenizer, GPT2Tokenizer)
|
self.assertIsInstance(tokenizer, GPT2Tokenizer)
|
||||||
self.assertGreater(len(tokenizer), 0)
|
self.assertGreater(len(tokenizer), 0)
|
||||||
|
|
||||||
|
def test_tokenizer_from_pretrained_identifier(self):
    """Load a tokenizer through AutoTokenizer using SMALL_MODEL_IDENTIFIER.

    Checks that the object returned is a BertTokenizer and that its
    length is exactly 12.
    """
    logging.basicConfig(level=logging.INFO)
    loaded = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
    self.assertIsInstance(loaded, BertTokenizer)
    self.assertEqual(len(loaded), 12)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
191
transformers/tests/tokenization_bert_japanese_test.py
Normal file
191
transformers/tests/tokenization_bert_japanese_test.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from transformers.tokenization_bert import WordpieceTokenizer
|
||||||
|
from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
|
||||||
|
MecabTokenizer, CharacterTokenizer,
|
||||||
|
VOCAB_FILES_NAMES)
|
||||||
|
|
||||||
|
from .tokenization_tests_commons import CommonTestCases
|
||||||
|
from .utils import slow, custom_tokenizers
|
||||||
|
|
||||||
|
|
||||||
|
@custom_tokenizers
class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
    """Tests for BertJapaneseTokenizer built without a subword_tokenizer_type argument.

    Also exercises MecabTokenizer and WordpieceTokenizer directly.
    """

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        """Write a small vocabulary file into the temporary test directory."""
        super(BertJapaneseTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは",
                        u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        """Return a BertJapaneseTokenizer loaded from the temporary test directory."""
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        """Return an (input_text, output_text) pair of sample strings."""
        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        """Tokenize a two-sentence string and check both the tokens and their ids."""
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こんにちは", u"、", u"世界", u"。",
                              u"こん", u"##ばんは", u"、", u"世界", "。"])
        # The ids are the positions of the tokens in the vocab written by setUp.
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 12, 10, 14, 4, 9, 12, 10, 14])

    def test_mecab_tokenizer(self):
        """Check MecabTokenizer output with default arguments."""
        tokenizer = MecabTokenizer()

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_lower(self):
        """Check MecabTokenizer output with do_lower_case=True ("iPhone" -> "iphone")."""
        tokenizer = MecabTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iphone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_no_normalize(self):
        """Check MecabTokenizer output with normalize_text=False."""
        tokenizer = MecabTokenizer(normalize_text=False)

        # NOTE(review): this input literal is identical to the one used in
        # test_mecab_tokenizer, yet the expected output carries an extra u" "
        # token. Confirm the literal has not lost non-ASCII whitespace or
        # width-variant characters somewhere in transit.
        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u" ", u"。"])

    def test_wordpiece_tokenizer(self):
        """Check WordpieceTokenizer on an in-memory vocabulary."""
        # A comma was missing between u"にちは" and u"ばんは"; implicit string
        # concatenation merged them into one entry and shifted later indices.
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは"]

        vocab = {token: i for i, token in enumerate(vocab_tokens)}
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こんにちは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
                             [u"こん", u"##ばんは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])

    @slow
    def test_sequence_builders(self):
        """Check build_inputs_with_special_tokens for a single sequence and a pair."""
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]
|
||||||
|
|
||||||
|
|
||||||
|
class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
    """Tests for BertJapaneseTokenizer with subword_tokenizer_type="character".

    Also exercises CharacterTokenizer directly.
    """

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        """Write a small single-character vocabulary file into the temporary test directory."""
        super(BertJapaneseCharacterTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        """Return a character-level BertJapaneseTokenizer loaded from the temporary test directory."""
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
                                                     subword_tokenizer_type="character",
                                                     **kwargs)

    def get_input_output_texts(self):
        """Return an (input_text, output_text) pair of sample strings."""
        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        """Tokenize a two-sentence string and check both the tokens and their ids."""
        tokenizer = self.tokenizer_class(self.vocab_file,
                                         subword_tokenizer_type="character")

        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
                              u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
        # The ids are the positions of the tokens in the vocab written by setUp.
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
                              3, 4, 8, 4, 7, 11, 9, 10, 12])

    def test_character_tokenizer(self):
        """Check CharacterTokenizer on an in-memory vocabulary."""
        # A comma was missing between u"界" and u"、"; implicit string
        # concatenation merged them into a single two-character entry.
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]

        vocab = {token: i for i, token in enumerate(vocab_tokens)}
        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こ", u"ん", u"に", u"ち", u"は"])

        # u"ほ" is not in the vocabulary, so the expected token is unk_token.
        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])

    @slow
    def test_sequence_builders(self):
        """Check build_inputs_with_special_tokens for a single sequence and a pair."""
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -139,5 +139,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
assert encoded_sentence == [101] + text + [102]
|
assert encoded_sentence == [101] + text + [102]
|
||||||
assert encoded_pair == [101] + text + [102] + text_2 + [102]
|
assert encoded_pair == [101] + text + [102] + text_2 + [102]
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user