Merge branch 'master' into t5
This commit is contained in:
@@ -5,8 +5,12 @@ function deploy_doc(){
|
|||||||
git checkout $1
|
git checkout $1
|
||||||
if [ ! -z "$2" ]
|
if [ ! -z "$2" ]
|
||||||
then
|
then
|
||||||
echo "Pushing version" $2
|
if [ -d "$dir/$2" ]; then
|
||||||
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
|
echo "Directory" $2 "already exists"
|
||||||
|
else
|
||||||
|
echo "Pushing version" $2
|
||||||
|
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo "Pushing master"
|
echo "Pushing master"
|
||||||
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
||||||
@@ -19,3 +23,4 @@ deploy_doc "fe02e45" v1.1.0
|
|||||||
deploy_doc "89fd345" v1.2.0
|
deploy_doc "89fd345" v1.2.0
|
||||||
deploy_doc "fc9faa8" v2.0.0
|
deploy_doc "fc9faa8" v2.0.0
|
||||||
deploy_doc "3ddce1d" v2.1.1
|
deploy_doc "3ddce1d" v2.1.1
|
||||||
|
deploy_doc "3616209" v2.2.0
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ assignees: ''
|
|||||||
|
|
||||||
* [ ] the model implementation is available: (give details)
|
* [ ] the model implementation is available: (give details)
|
||||||
* [ ] the model weights are available: (give details)
|
* [ ] the model weights are available: (give details)
|
||||||
|
* [ ] who are the authors: (mention them)
|
||||||
|
|
||||||
## Additional context
|
## Additional context
|
||||||
|
|
||||||
|
|||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -137,4 +137,5 @@ examples/runs
|
|||||||
serialization_dir
|
serialization_dir
|
||||||
|
|
||||||
# emacs
|
# emacs
|
||||||
*.*~
|
*.*~
|
||||||
|
debug.env
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ Follow these steps to start contributing:
|
|||||||
```bash
|
```bash
|
||||||
$ git clone git@github.com:<your Github handle>/transformers.git
|
$ git clone git@github.com:<your Github handle>/transformers.git
|
||||||
$ cd transformers
|
$ cd transformers
|
||||||
$ git remote add upstream git@github.com:huggingface/transformers.git
|
$ git remote add upstream https://github.com/huggingface/transformers.git
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Create a new branch to hold your development changes:
|
3. Create a new branch to hold your development changes:
|
||||||
|
|||||||
48
README.md
48
README.md
@@ -58,7 +58,7 @@ Choose the right framework for every part of a model's lifetime
|
|||||||
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
||||||
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
||||||
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
||||||
| [Documentation](https://huggingface.co/transformers/) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) | Full API documentation and more |
|
| [Documentation](https://huggingface.co/transformers/) [(v2.2.0/v2.2.1)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
@@ -86,21 +86,41 @@ When TensorFlow 2.0 and/or PyTorch has been installed, you can install from sour
|
|||||||
pip install [--editable] .
|
pip install [--editable] .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Run the examples
|
||||||
|
|
||||||
|
Examples are included in the repository but are not shipped with the library.
|
||||||
|
Therefore, in order to run the latest versions of the examples you also need to install from source. To do so, create a new virtual environment and follow these steps:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/huggingface/transformers
|
||||||
|
cd transformers
|
||||||
|
pip install [--editable] .
|
||||||
|
```
|
||||||
|
|
||||||
### Tests
|
### Tests
|
||||||
|
|
||||||
A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||||
|
|
||||||
These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
These tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||||
|
|
||||||
Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
|
Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
|
||||||
|
|
||||||
You can run the tests from the root of the cloned repository with the commands:
|
You can run the tests from the root of the cloned repository with the commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||||
|
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m pytest -sv ./transformers/tests/
|
python -m pytest -sv ./transformers/tests/
|
||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||||
|
|
||||||
### Do you want to run a Transformer model on a mobile device?
|
### Do you want to run a Transformer model on a mobile device?
|
||||||
|
|
||||||
You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
|
You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
|
||||||
@@ -120,10 +140,12 @@ At some point in the future, you'll be able to seamlessly move from pre-training
|
|||||||
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||||
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||||
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
|
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
|
||||||
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||||
10. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||||
11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
|
11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||||
|
12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||||
|
13. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
|
||||||
|
|
||||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
||||||
|
|
||||||
@@ -172,8 +194,7 @@ for model_class, tokenizer_class, pretrained_weights in MODELS:
|
|||||||
|
|
||||||
# Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
|
# Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
|
||||||
BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
|
BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
|
||||||
BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
|
BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]
|
||||||
BertForQuestionAnswering]
|
|
||||||
|
|
||||||
# All the classes for an architecture can be initiated from pretrained weights for this architecture
|
# All the classes for an architecture can be initiated from pretrained weights for this architecture
|
||||||
# Note that additional weights added for fine-tuning are only initialized
|
# Note that additional weights added for fine-tuning are only initialized
|
||||||
@@ -254,6 +275,11 @@ print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sen
|
|||||||
|
|
||||||
## Quick tour of the fine-tuning/usage scripts
|
## Quick tour of the fine-tuning/usage scripts
|
||||||
|
|
||||||
|
**Important**
|
||||||
|
Before running the fine-tuning scripts, please read the
|
||||||
|
[instructions](#run-the-examples) on how to
|
||||||
|
setup your environment to run the examples.
|
||||||
|
|
||||||
The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
|
The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
|
||||||
|
|
||||||
- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
|
- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
|
||||||
@@ -522,12 +548,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
|
|||||||
# Parameters:
|
# Parameters:
|
||||||
lr = 1e-3
|
lr = 1e-3
|
||||||
max_grad_norm = 1.0
|
max_grad_norm = 1.0
|
||||||
num_total_steps = 1000
|
num_training_steps = 1000
|
||||||
num_warmup_steps = 100
|
num_warmup_steps = 100
|
||||||
warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
|
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
|
||||||
|
|
||||||
### Previously BertAdam optimizer was instantiated like this:
|
### Previously BertAdam optimizer was instantiated like this:
|
||||||
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
|
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
|
||||||
### and used like this:
|
### and used like this:
|
||||||
for batch in train_data:
|
for batch in train_data:
|
||||||
loss = model(batch)
|
loss = model(batch)
|
||||||
@@ -536,7 +562,7 @@ for batch in train_data:
|
|||||||
|
|
||||||
### In Transformers, optimizer and schedules are split and instantiated like this:
|
### In Transformers, optimizer and schedules are split and instantiated like this:
|
||||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
|
||||||
### and used like this:
|
### and used like this:
|
||||||
for batch in train_data:
|
for batch in train_data:
|
||||||
model.train()
|
model.train()
|
||||||
|
|||||||
22
deploy_multi_version_doc.sh
Normal file
22
deploy_multi_version_doc.sh
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
cd docs
|
||||||
|
|
||||||
|
function deploy_doc(){
|
||||||
|
echo "Creating doc at commit $1 and pushing to folder $2"
|
||||||
|
git checkout $1
|
||||||
|
if [ ! -z "$2" ]
|
||||||
|
then
|
||||||
|
echo "Pushing version" $2
|
||||||
|
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
|
||||||
|
else
|
||||||
|
echo "Pushing master"
|
||||||
|
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
deploy_doc "master"
|
||||||
|
deploy_doc "b33a385" v1.0.0
|
||||||
|
deploy_doc "fe02e45" v1.1.0
|
||||||
|
deploy_doc "89fd345" v1.2.0
|
||||||
|
deploy_doc "fc9faa8" v2.0.0
|
||||||
|
deploy_doc "3ddce1d" v2.1.1
|
||||||
|
deploy_doc "f2f3294" v2.2.0
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
function addIcon() {
|
function addIcon() {
|
||||||
const huggingFaceLogo = "https://huggingface.co/assets/transformers-docs/huggingface_logo.svg";
|
const huggingFaceLogo = "https://huggingface.co/landing/assets/transformers-docs/huggingface_logo.svg";
|
||||||
const image = document.createElement("img");
|
const image = document.createElement("img");
|
||||||
image.setAttribute("src", huggingFaceLogo);
|
image.setAttribute("src", huggingFaceLogo);
|
||||||
|
|
||||||
@@ -24,10 +24,10 @@ function addCustomFooter() {
|
|||||||
social.classList.add("footer__Social");
|
social.classList.add("footer__Social");
|
||||||
|
|
||||||
const imageDetails = [
|
const imageDetails = [
|
||||||
{ link: "https://huggingface.co", imageLink: "https://huggingface.co/assets/transformers-docs/website.svg" },
|
{ link: "https://huggingface.co", imageLink: "https://huggingface.co/landing/assets/transformers-docs/website.svg" },
|
||||||
{ link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/assets/transformers-docs/twitter.svg" },
|
{ link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/twitter.svg" },
|
||||||
{ link: "https://github.com/huggingface", imageLink: "https://huggingface.co/assets/transformers-docs/github.svg" },
|
{ link: "https://github.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/github.svg" },
|
||||||
{ link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/assets/transformers-docs/linkedin.svg" }
|
{ link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/landing/assets/transformers-docs/linkedin.svg" }
|
||||||
];
|
];
|
||||||
|
|
||||||
imageDetails.forEach(imageLinks => {
|
imageDetails.forEach(imageLinks => {
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ author = u'huggingface'
|
|||||||
# The short X.Y version
|
# The short X.Y version
|
||||||
version = u''
|
version = u''
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = u'2.1.1'
|
release = u'2.2.1'
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|||||||
@@ -47,6 +47,9 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
|||||||
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
||||||
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
|
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
|
||||||
|
9. `CTRL <https://github.com/salesforce/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||||
|
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
|
||||||
|
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
@@ -89,3 +92,5 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
|||||||
model_doc/roberta
|
model_doc/roberta
|
||||||
model_doc/distilbert
|
model_doc/distilbert
|
||||||
model_doc/ctrl
|
model_doc/ctrl
|
||||||
|
model_doc/camembert
|
||||||
|
model_doc/albert
|
||||||
|
|||||||
@@ -24,15 +24,24 @@ pip install [--editable] .
|
|||||||
|
|
||||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||||
|
|
||||||
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
|
Tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||||
|
|
||||||
Run all the tests from the root of the cloned repository with the commands:
|
Run all the tests from the root of the cloned repository with the commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||||
|
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
python -m pytest -sv ./transformers/tests/
|
python -m pytest -sv ./transformers/tests/
|
||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||||
|
|
||||||
## OpenAI GPT original tokenization workflow
|
## OpenAI GPT original tokenization workflow
|
||||||
|
|
||||||
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
|
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ The ``.optimization`` module provides:
|
|||||||
|
|
||||||
- an optimizer with weight decay fixed that can be used to fine-tune models, and
|
- an optimizer with weight decay fixed that can be used to fine-tune models, and
|
||||||
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
|
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
|
||||||
|
- a gradient accumulation class to accumulate the gradients of multiple batches
|
||||||
|
|
||||||
``AdamW``
|
``AdamW``
|
||||||
~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~
|
||||||
@@ -12,25 +13,32 @@ The ``.optimization`` module provides:
|
|||||||
.. autoclass:: transformers.AdamW
|
.. autoclass:: transformers.AdamW
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
``AdamWeightDecay``
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AdamWeightDecay
|
||||||
|
:members:
|
||||||
|
|
||||||
|
.. autofunction:: transformers.create_optimizer
|
||||||
|
:members:
|
||||||
|
|
||||||
Schedules
|
Schedules
|
||||||
----------------------------------------------------
|
----------------------------------------------------
|
||||||
|
|
||||||
Learning Rate Schedules
|
Learning Rate Schedules
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
.. autoclass:: transformers.ConstantLRSchedule
|
.. autofunction:: transformers.get_constant_schedule
|
||||||
:members:
|
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: transformers.WarmupConstantSchedule
|
.. autofunction:: transformers.get_constant_schedule_with_warmup
|
||||||
:members:
|
|
||||||
|
|
||||||
.. image:: /imgs/warmup_constant_schedule.png
|
.. image:: /imgs/warmup_constant_schedule.png
|
||||||
:target: /imgs/warmup_constant_schedule.png
|
:target: /imgs/warmup_constant_schedule.png
|
||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: transformers.WarmupCosineSchedule
|
.. autofunction:: transformers.get_cosine_schedule_with_warmup
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_cosine_schedule.png
|
.. image:: /imgs/warmup_cosine_schedule.png
|
||||||
@@ -38,8 +46,7 @@ Learning Rate Schedules
|
|||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
|
.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup
|
||||||
:members:
|
|
||||||
|
|
||||||
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
||||||
:target: /imgs/warmup_cosine_hard_restarts_schedule.png
|
:target: /imgs/warmup_cosine_hard_restarts_schedule.png
|
||||||
@@ -47,9 +54,22 @@ Learning Rate Schedules
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: transformers.WarmupLinearSchedule
|
.. autofunction:: transformers.get_linear_schedule_with_warmup
|
||||||
:members:
|
|
||||||
|
|
||||||
.. image:: /imgs/warmup_linear_schedule.png
|
.. image:: /imgs/warmup_linear_schedule.png
|
||||||
:target: /imgs/warmup_linear_schedule.png
|
:target: /imgs/warmup_linear_schedule.png
|
||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
``Warmup``
|
||||||
|
~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.Warmup
|
||||||
|
:members:
|
||||||
|
|
||||||
|
Gradient Strategies
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
``GradientAccumulator``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.GradientAccumulator
|
||||||
|
|||||||
@@ -54,5 +54,100 @@ Additionally, the following method can be used to load values from a data file
|
|||||||
Example usage
|
Example usage
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
|
||||||
|
|
||||||
|
|
||||||
|
XNLI
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
|
||||||
|
the quality of cross-lingual text representations.
|
||||||
|
XNLI is a crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`__: pairs of text are labeled with textual entailment
|
||||||
|
annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
|
||||||
|
|
||||||
|
It was released together with the paper
|
||||||
|
`XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__
|
||||||
|
|
||||||
|
This library hosts the processor to load the XNLI data:
|
||||||
|
- :class:`~transformers.data.processors.utils.XnliProcessor`
|
||||||
|
|
||||||
|
Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
|
||||||
|
|
||||||
An example using these processors is given in the
|
An example using these processors is given in the
|
||||||
`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
|
`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
|
||||||
|
|
||||||
|
|
||||||
|
SQuAD
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer//>`__ is a benchmark that evaluates
|
||||||
|
the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
|
||||||
|
`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside
|
||||||
|
the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
|
||||||
|
|
||||||
|
This library hosts a processor for each of the two versions:
|
||||||
|
|
||||||
|
Processors
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Those processors are:
|
||||||
|
- :class:`~transformers.data.processors.utils.SquadV1Processor`
|
||||||
|
- :class:`~transformers.data.processors.utils.SquadV2Processor`
|
||||||
|
|
||||||
|
They both inherit from the abstract class :class:`~transformers.data.processors.squad.SquadProcessor`
|
||||||
|
|
||||||
|
.. autoclass:: transformers.data.processors.squad.SquadProcessor
|
||||||
|
:members:
|
||||||
|
|
||||||
|
Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures`
|
||||||
|
that can be used as model inputs.
|
||||||
|
|
||||||
|
.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
|
||||||
|
|
||||||
|
These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
|
||||||
|
Examples are given below.
|
||||||
|
|
||||||
|
|
||||||
|
Example usage
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
Here is an example using the processors as well as the conversion method using data files:
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
# Loading a V2 processor
|
||||||
|
processor = SquadV2Processor()
|
||||||
|
examples = processor.get_dev_examples(squad_v2_data_dir)
|
||||||
|
|
||||||
|
# Loading a V1 processor
|
||||||
|
processor = SquadV1Processor()
|
||||||
|
examples = processor.get_dev_examples(squad_v1_data_dir)
|
||||||
|
|
||||||
|
features = squad_convert_examples_to_features(
|
||||||
|
examples=examples,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
max_seq_length=max_seq_length,
|
||||||
|
doc_stride=args.doc_stride,
|
||||||
|
max_query_length=max_query_length,
|
||||||
|
is_training=not evaluate,
|
||||||
|
)
|
||||||
|
|
||||||
|
Using `tensorflow_datasets` is as easy as using a data file:
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
# tensorflow_datasets only handles Squad V1.
|
||||||
|
tfds_examples = tfds.load("squad")
|
||||||
|
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||||
|
|
||||||
|
features = squad_convert_examples_to_features(
|
||||||
|
examples=examples,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
max_seq_length=max_seq_length,
|
||||||
|
doc_stride=args.doc_stride,
|
||||||
|
max_query_length=max_query_length,
|
||||||
|
is_training=not evaluate,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
Another example using these processors is given in the
|
||||||
|
`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
|
||||||
|
|||||||
@@ -84,12 +84,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
|
|||||||
# Parameters:
|
# Parameters:
|
||||||
lr = 1e-3
|
lr = 1e-3
|
||||||
max_grad_norm = 1.0
|
max_grad_norm = 1.0
|
||||||
num_total_steps = 1000
|
num_training_steps = 1000
|
||||||
num_warmup_steps = 100
|
num_warmup_steps = 100
|
||||||
warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
|
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
|
||||||
|
|
||||||
### Previously BertAdam optimizer was instantiated like this:
|
### Previously BertAdam optimizer was instantiated like this:
|
||||||
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
|
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
|
||||||
### and used like this:
|
### and used like this:
|
||||||
for batch in train_data:
|
for batch in train_data:
|
||||||
loss = model(batch)
|
loss = model(batch)
|
||||||
@@ -98,12 +98,12 @@ for batch in train_data:
|
|||||||
|
|
||||||
### In Transformers, optimizer and schedules are split and instantiated like this:
|
### In Transformers, optimizer and schedules are split and instantiated like this:
|
||||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
|
||||||
### and used like this:
|
### and used like this:
|
||||||
for batch in train_data:
|
for batch in train_data:
|
||||||
loss = model(batch)
|
loss = model(batch)
|
||||||
loss.backward()
|
loss.backward()
|
||||||
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
|
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
|
||||||
scheduler.step()
|
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
scheduler.step()
|
||||||
```
|
```
|
||||||
|
|||||||
64
docs/source/model_doc/albert.rst
Normal file
64
docs/source/model_doc/albert.rst
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
ALBERT
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
``AlbertConfig``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertConfig
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertTokenizer``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertTokenizer
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``AlbertForQuestionAnswering``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.AlbertForQuestionAnswering
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFAlbertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFAlbertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFAlbertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFAlbertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``TFAlbertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.TFAlbertForSequenceClassification
|
||||||
|
:members:
|
||||||
50
docs/source/model_doc/camembert.rst
Normal file
50
docs/source/model_doc/camembert.rst
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
CamemBERT
|
||||||
|
----------------------------------------------------
|
||||||
|
|
||||||
|
``CamembertConfig``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertConfig
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertTokenizer``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertTokenizer
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertModel``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertModel
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertForMaskedLM``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertForMaskedLM
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertForSequenceClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertForSequenceClassification
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertForMultipleChoice``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertForMultipleChoice
|
||||||
|
:members:
|
||||||
|
|
||||||
|
|
||||||
|
``CamembertForTokenClassification``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.CamembertForTokenClassification
|
||||||
|
:members:
|
||||||
@@ -73,6 +73,9 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| | ``gpt2-large`` | | 36-layer, 1280-hidden, 20-heads, 774M parameters. |
|
| | ``gpt2-large`` | | 36-layer, 1280-hidden, 20-heads, 774M parameters. |
|
||||||
| | | | OpenAI's Large-sized GPT-2 English model |
|
| | | | OpenAI's Large-sized GPT-2 English model |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``gpt2-xl`` | | 48-layer, 1600-hidden, 25-heads, 1558M parameters. |
|
||||||
|
| | | | OpenAI's XL-sized GPT-2 English model |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| Transformer-XL | ``transfo-xl-wt103`` | | 18-layer, 1024-hidden, 16-heads, 257M parameters. |
|
| Transformer-XL | ``transfo-xl-wt103`` | | 18-layer, 1024-hidden, 16-heads, 257M parameters. |
|
||||||
| | | | English model trained on wikitext-103 |
|
| | | | English model trained on wikitext-103 |
|
||||||
@@ -124,6 +127,14 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| | ``roberta-large-mnli`` | | 24-layer, 1024-hidden, 16-heads, 355M parameters |
|
| | ``roberta-large-mnli`` | | 24-layer, 1024-hidden, 16-heads, 355M parameters |
|
||||||
| | | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__. |
|
| | | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__. |
|
||||||
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
|
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``roberta-base-openai-detector`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
|
||||||
|
| | | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. |
|
||||||
|
| | | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``roberta-large-openai-detector`` | | 24-layer, 1024-hidden, 16-heads, 355M parameters |
|
||||||
|
| | | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. |
|
||||||
|
| | | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| DistilBERT | ``distilbert-base-uncased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
| DistilBERT | ``distilbert-base-uncased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||||
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
||||||
@@ -140,10 +151,54 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
|
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
|
||||||
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
|
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
|
||||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||||
|
| | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. |
|
||||||
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``distilbert-base-multilingual-cased`` | | 6-layer, 768-hidden, 12-heads, 134M parameters |
|
||||||
|
| | | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint. |
|
||||||
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters |
|
| CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters |
|
||||||
| | | | Salesforce's Large-sized CTRL English model |
|
| | | | Salesforce's Large-sized CTRL English model |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| CamemBERT | ``camembert-base`` | | 12-layer, 768-hidden, 12-heads, 110M parameters |
|
||||||
|
| | | | CamemBERT using the BERT-base architecture |
|
||||||
|
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__) |
|
||||||
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||||
|
| | | | ALBERT base model |
|
||||||
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||||
|
| | | | ALBERT large model |
|
||||||
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||||
|
| | | | ALBERT xlarge model |
|
||||||
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-xxlarge-v1`` | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||||
|
| | | | ALBERT xxlarge model |
|
||||||
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
|
||||||
|
| | | | ALBERT base model with no dropout, additional training data and longer training |
|
||||||
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
|
||||||
|
| | | | ALBERT large model with no dropout, additional training data and longer training |
|
||||||
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
|
||||||
|
| | | | ALBERT xlarge model with no dropout, additional training data and longer training |
|
||||||
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
| | ``albert-xxlarge-v2`` | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
|
||||||
|
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
|
||||||
|
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||||
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
| T5 | ``t5-small`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
| T5 | ``t5-small`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||||
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
||||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
@@ -165,4 +220,5 @@ Here is the full list of the currently provided pretrained models together with
|
|||||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||||
|
|
||||||
.. <https://huggingface.co/transformers/examples.html>`__
|
|
||||||
|
.. <https://huggingface.co/transformers/examples.html>`__
|
||||||
|
|||||||
@@ -188,3 +188,35 @@ assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
|
|||||||
```
|
```
|
||||||
|
|
||||||
Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
|
Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
|
||||||
|
|
||||||
|
#### Using the past
|
||||||
|
|
||||||
|
GPT-2 as well as some other models (GPT, XLNet, Transfo-XL, CTRL) make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
|
||||||
|
|
||||||
|
Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
||||||
|
import torch
|
||||||
|
|
||||||
|
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
||||||
|
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||||
|
|
||||||
|
generated = tokenizer.encode("The Manhattan bridge")
|
||||||
|
context = torch.tensor([generated])
|
||||||
|
past = None
|
||||||
|
|
||||||
|
for i in range(100):
|
||||||
|
print(i)
|
||||||
|
output, past = model(context, past=past)
|
||||||
|
token = torch.argmax(output[..., -1, :])
|
||||||
|
|
||||||
|
generated += [token.tolist()]
|
||||||
|
context = token.unsqueeze(0)
|
||||||
|
|
||||||
|
sequence = tokenizer.decode(generated)
|
||||||
|
|
||||||
|
print(sequence)
|
||||||
|
```
|
||||||
|
|
||||||
|
The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
|
||||||
@@ -106,7 +106,7 @@ This section explain how you can save and re-load a fine-tuned model (BERT, GPT,
|
|||||||
There are three types of files you need to save to be able to reload a fine-tuned model:
|
There are three types of files you need to save to be able to reload a fine-tuned model:
|
||||||
|
|
||||||
|
|
||||||
* the model it-self which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
|
* the model itself which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
|
||||||
* the configuration file of the model which is saved as a JSON file, and
|
* the configuration file of the model which is saved as a JSON file, and
|
||||||
* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
|
* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,17 @@
|
|||||||
In this section a few examples are put together. All of these examples work for several models, making use of the very
|
In this section a few examples are put together. All of these examples work for several models, making use of the very
|
||||||
similar API between the different models.
|
similar API between the different models.
|
||||||
|
|
||||||
|
**Important**
|
||||||
|
To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
|
||||||
|
Execute the following steps in a new virtual environment:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/huggingface/transformers
|
||||||
|
cd transformers
|
||||||
|
pip install [--editable] .
|
||||||
|
pip install -r ./examples/requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks.
|
| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks.
|
||||||
@@ -12,7 +23,9 @@ similar API between the different models.
|
|||||||
| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
|
| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
|
||||||
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
||||||
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
|
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
|
||||||
| [Abstractive summarization](#abstractive-summarization) | Fine-tuning the library models for abstractive summarization tasks on the CNN/Daily Mail dataset. |
|
| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
|
||||||
|
| [Abstractive summarization](#abstractive-summarization) | Using the BertAbs model finetuned on the CNN/DailyMail dataset to generate summaries. |
|
||||||
|
|
||||||
## TensorFlow 2.0 Bert models on GLUE
|
## TensorFlow 2.0 Bert models on GLUE
|
||||||
|
|
||||||
@@ -455,7 +468,8 @@ Training with the previously defined hyper-parameters yields the following resul
|
|||||||
|
|
||||||
## Named Entity Recognition
|
## Named Entity Recognition
|
||||||
|
|
||||||
Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py).
|
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
|
||||||
|
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
|
||||||
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
|
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
|
||||||
Details and results for the fine-tuning provided by @stefan-it.
|
Details and results for the fine-tuning provided by @stefan-it.
|
||||||
|
|
||||||
@@ -500,7 +514,7 @@ The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so
|
|||||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
### Training
|
### Prepare the run
|
||||||
|
|
||||||
Additional environment variables must be set:
|
Additional environment variables must be set:
|
||||||
|
|
||||||
@@ -512,6 +526,8 @@ export SAVE_STEPS=750
|
|||||||
export SEED=1
|
export SEED=1
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Run the Pytorch version
|
||||||
|
|
||||||
To start training, just run:
|
To start training, just run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -532,7 +548,7 @@ python3 run_ner.py --data_dir ./ \
|
|||||||
|
|
||||||
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
|
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
|
||||||
|
|
||||||
### Evaluation
|
#### Evaluation
|
||||||
|
|
||||||
Evaluation on development dataset outputs the following for our example:
|
Evaluation on development dataset outputs the following for our example:
|
||||||
|
|
||||||
@@ -554,6 +570,82 @@ On the test dataset the following results could be achieved:
|
|||||||
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
|
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
|
||||||
|
|
||||||
|
Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
|
||||||
|
|
||||||
|
| Model | F-Score Dev | F-Score Test
|
||||||
|
| --------------------------------- | ------- | --------
|
||||||
|
| `bert-large-cased` | 95.59 | 91.70
|
||||||
|
| `roberta-large` | 95.96 | 91.87
|
||||||
|
| `distilbert-base-uncased` | 94.34 | 90.32
|
||||||
|
|
||||||
|
### Run the Tensorflow 2 version
|
||||||
|
|
||||||
|
To start training, just run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 run_tf_ner.py --data_dir ./ \
|
||||||
|
--model_type bert \
|
||||||
|
--labels ./labels.txt \
|
||||||
|
--model_name_or_path $BERT_MODEL \
|
||||||
|
--output_dir $OUTPUT_DIR \
|
||||||
|
--max_seq_length $MAX_LENGTH \
|
||||||
|
--num_train_epochs $NUM_EPOCHS \
|
||||||
|
--per_device_train_batch_size $BATCH_SIZE \
|
||||||
|
--save_steps $SAVE_STEPS \
|
||||||
|
--seed $SEED \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_predict
|
||||||
|
```
|
||||||
|
|
||||||
|
As with the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
|
||||||
|
|
||||||
|
#### Evaluation
|
||||||
|
|
||||||
|
Evaluation on development dataset outputs the following for our example:
|
||||||
|
```bash
|
||||||
|
precision recall f1-score support
|
||||||
|
|
||||||
|
LOCderiv 0.7619 0.6154 0.6809 52
|
||||||
|
PERpart 0.8724 0.8997 0.8858 4057
|
||||||
|
OTHpart 0.9360 0.9466 0.9413 711
|
||||||
|
ORGpart 0.7015 0.6989 0.7002 269
|
||||||
|
LOCpart 0.7668 0.8488 0.8057 496
|
||||||
|
LOC 0.8745 0.9191 0.8963 235
|
||||||
|
ORGderiv 0.7723 0.8571 0.8125 91
|
||||||
|
OTHderiv 0.4800 0.6667 0.5581 18
|
||||||
|
OTH 0.5789 0.6875 0.6286 16
|
||||||
|
PERderiv 0.5385 0.3889 0.4516 18
|
||||||
|
PER 0.5000 0.5000 0.5000 2
|
||||||
|
ORG 0.0000 0.0000 0.0000 3
|
||||||
|
|
||||||
|
micro avg 0.8574 0.8862 0.8715 5968
|
||||||
|
macro avg 0.8575 0.8862 0.8713 5968
|
||||||
|
```
|
||||||
|
|
||||||
|
On the test dataset the following results could be achieved:
|
||||||
|
```bash
|
||||||
|
precision recall f1-score support
|
||||||
|
|
||||||
|
PERpart 0.8847 0.8944 0.8896 9397
|
||||||
|
OTHpart 0.9376 0.9353 0.9365 1639
|
||||||
|
ORGpart 0.7307 0.7044 0.7173 697
|
||||||
|
LOC 0.9133 0.9394 0.9262 561
|
||||||
|
LOCpart 0.8058 0.8157 0.8107 1150
|
||||||
|
ORG 0.0000 0.0000 0.0000 8
|
||||||
|
OTHderiv 0.5882 0.4762 0.5263 42
|
||||||
|
PERderiv 0.6571 0.5227 0.5823 44
|
||||||
|
OTH 0.4906 0.6667 0.5652 39
|
||||||
|
ORGderiv 0.7016 0.7791 0.7383 172
|
||||||
|
LOCderiv 0.8256 0.6514 0.7282 109
|
||||||
|
PER 0.0000 0.0000 0.0000 11
|
||||||
|
|
||||||
|
micro avg 0.8722 0.8774 0.8748 13869
|
||||||
|
macro avg 0.8712 0.8774 0.8740 13869
|
||||||
|
```
|
||||||
|
|
||||||
## Abstractive summarization
|
## Abstractive summarization
|
||||||
|
|
||||||
Based on the script
|
Based on the script
|
||||||
@@ -581,3 +673,43 @@ python run_summarization_finetuning.py \
|
|||||||
--do_train \
|
--do_train \
|
||||||
--data_path=$DATA_PATH \
|
--data_path=$DATA_PATH \
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## XNLI
|
||||||
|
|
||||||
|
Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
|
||||||
|
|
||||||
|
[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is a crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
|
||||||
|
|
||||||
|
#### Fine-tuning on XNLI
|
||||||
|
|
||||||
|
This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins
|
||||||
|
on a single tesla V100 16GB. The data for XNLI can be downloaded with the following links and should be both saved (and un-zipped) in a
|
||||||
|
`$XNLI_DIR` directory.
|
||||||
|
|
||||||
|
* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
|
||||||
|
* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export XNLI_DIR=/path/to/XNLI
|
||||||
|
|
||||||
|
python run_xnli.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-multilingual-cased \
|
||||||
|
--language de \
|
||||||
|
--train_language en \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--data_dir $XNLI_DIR \
|
||||||
|
--per_gpu_train_batch_size 32 \
|
||||||
|
--learning_rate 5e-5 \
|
||||||
|
--num_train_epochs 2.0 \
|
||||||
|
--max_seq_length 128 \
|
||||||
|
--output_dir /tmp/debug_xnli/ \
|
||||||
|
--save_steps -1
|
||||||
|
```
|
||||||
|
|
||||||
|
Training with the previously defined hyper-parameters yields the following results on the **test** set:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
acc = 0.7093812375249501
|
||||||
|
```
|
||||||
|
|||||||
48
examples/contrib/run_camembert.py
Normal file
48
examples/contrib/run_camembert.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import tarfile
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from transformers.tokenization_camembert import CamembertTokenizer
|
||||||
|
from transformers.modeling_camembert import CamembertForMaskedLM
|
||||||
|
|
||||||
|
|
||||||
|
def fill_mask(masked_input, model, tokenizer, topk=5):
    """Return the ``topk`` most probable completions of the single ``<mask>`` in ``masked_input``.

    Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py

    Args:
        masked_input: Text containing exactly one ``<mask>`` placeholder.
        model: Callable taking a ``(1, seq_len)`` tensor of token ids and returning a
            tuple whose first element is indexed here as ``[batch, position, vocab]``.
        tokenizer: Object providing ``encode``, ``convert_ids_to_tokens``,
            ``mask_token`` and ``mask_token_id``.
        topk: Number of candidate completions to return.

    Returns:
        A list of ``(filled_text, probability, predicted_token)`` tuples, ordered
        from most to least probable.

    Raises:
        AssertionError: If ``masked_input`` does not contain exactly one ``<mask>``.
    """
    assert masked_input.count('<mask>') == 1
    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1

    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        # First element of the output tuple; used below as per-position vocabulary scores.
        logits = model(input_ids)[0]

    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
    prob = logits[0, masked_index, :].softmax(dim=0)
    values, indices = prob.topk(k=topk, dim=0)

    masked_token = tokenizer.mask_token
    # When the mask is preceded by a space, replace that space too: the predicted
    # piece brings its own leading space once '\u2581' has been mapped to ' '.
    spaced_mask = ' {0}'.format(masked_token)
    target = spaced_mask if spaced_mask in masked_input else masked_token

    topk_filled_outputs = []
    for value, token_id in zip(values.tolist(), indices.tolist()):
        predicted_token = tokenizer.convert_ids_to_tokens(token_id).replace('\u2581', ' ')
        topk_filled_outputs.append((
            masked_input.replace(target, predicted_token),
            value,
            predicted_token,
        ))
    return topk_filled_outputs
|
||||||
|
|
||||||
|
|
||||||
|
# Load the pretrained CamemBERT masked-LM and switch it to evaluation mode.
model = CamembertForMaskedLM.from_pretrained('camembert-base')
model.eval()

# Matching tokenizer for the same checkpoint.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Demo: print the three most likely completions of the masked sentence.
masked_input = "Le camembert est <mask> :)"
print(fill_mask(masked_input, model, tokenizer, topk=3))
|
||||||
@@ -41,7 +41,7 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
|||||||
|
|
||||||
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
||||||
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
||||||
WarmupLinearSchedule)
|
get_linear_schedule_with_warmup)
|
||||||
|
|
||||||
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
||||||
|
|
||||||
@@ -211,7 +211,7 @@ def main():
|
|||||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
|
|
||||||
if args.do_train:
|
if args.do_train:
|
||||||
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
|
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
|
||||||
@@ -237,7 +237,7 @@ def main():
|
|||||||
# Save a trained model
|
# Save a trained model
|
||||||
if args.do_train:
|
if args.do_train:
|
||||||
# Save a trained model, configuration and tokenizer
|
# Save a trained model, configuration and tokenizer
|
||||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
|
||||||
|
|
||||||
# If we save using the predefined names, we can load using `from_pretrained`
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ from tqdm import tqdm, trange
|
|||||||
from transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForMultipleChoice, BertTokenizer)
|
BertForMultipleChoice, BertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -322,7 +322,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -2,6 +2,10 @@
|
|||||||
|
|
||||||
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
|
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
|
||||||
|
|
||||||
|
**December 6th, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||||
|
|
||||||
|
**November 19th, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
|
||||||
|
|
||||||
**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
|
**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
|
||||||
|
|
||||||
**October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
|
**October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
|
||||||
@@ -15,8 +19,9 @@ Distil* is a class of compressed models that started with DistilBERT. DistilBERT
|
|||||||
|
|
||||||
We have applied the same method to other Transformer architectures and released the weights:
|
We have applied the same method to other Transformer architectures and released the weights:
|
||||||
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
|
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
|
||||||
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base` performance on GLUE while being twice faster and 35% smaller.
|
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice faster and 35% smaller.
|
||||||
- and more to come! 🤗🤗🤗
|
- German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
|
||||||
|
- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice as fast and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||||
|
|
||||||
For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
|
For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
|
||||||
|
|
||||||
@@ -27,7 +32,7 @@ Here are the results on the dev sets of GLUE:
|
|||||||
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
||||||
| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
||||||
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
||||||
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
|
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
|
||||||
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
|
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
|
||||||
|
|
||||||
<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa.
|
<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa.
|
||||||
@@ -36,6 +41,14 @@ Here are the results on the dev sets of GLUE:
|
|||||||
|
|
||||||
<sup>3</sup> We compute this score ourselves for completeness.
|
<sup>3</sup> We compute this score ourselves for completeness.
|
||||||
|
|
||||||
|
Here are the results on the *test* sets for 6 of the languages available in XNLI. The results are computed in the zero shot setting (trained on the English portion and evaluated on the target language portion):
|
||||||
|
|
||||||
|
| Model | English | Spanish | Chinese | German | Arabic | Urdu |
|
||||||
|
| :---: | :---: | :---: | :---: | :---: | :---: | :---:|
|
||||||
|
| mBERT base cased (computed) | 82.1 | 74.6 | 69.1 | 72.3 | 66.4 | 58.5 |
|
||||||
|
| mBERT base uncased (reported)| 81.4 | 74.3 | 63.8 | 70.5 | 62.1 | 58.3 |
|
||||||
|
| DistilmBERT | 78.2 | 69.1 | 64.0 | 66.3 | 59.1 | 54.7 |
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`.
|
This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`.
|
||||||
@@ -45,13 +58,14 @@ This part of the library has only be tested with Python3.6+. There are few speci
|
|||||||
|
|
||||||
## How to use DistilBERT
|
## How to use DistilBERT
|
||||||
|
|
||||||
Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
Transformers includes six pre-trained Distil* models, provided for English and German as well as a multilingual model:
|
||||||
|
|
||||||
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
|
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
|
||||||
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
|
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
|
||||||
|
- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
|
||||||
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
|
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
|
||||||
- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
|
- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
|
||||||
- and more to come! 🤗🤗🤗
|
- `distilbert-base-multilingual-cased`: DistilmBERT multilingual model pretrained with the supervision of `bert-base-multilingual-cased` on the concatenation of Wikipedia in 104 different languages. The model supports the 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). The model has 6 layers, 768 dimension and 12 heads, totalizing 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base.
|
||||||
|
|
||||||
Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
|
Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
|
||||||
|
|
||||||
@@ -67,6 +81,7 @@ last_hidden_states = outputs[0] # The last hidden-state is the first element of
|
|||||||
Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
|
Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
|
||||||
- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
|
- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
|
||||||
- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
|
- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
|
||||||
|
- DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
|
||||||
|
|
||||||
|
|
||||||
## How to train Distil*
|
## How to train Distil*
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ import psutil
|
|||||||
import time
|
import time
|
||||||
from tqdm import trange, tqdm
|
from tqdm import trange, tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import psutil
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
@@ -35,7 +34,7 @@ try:
|
|||||||
except:
|
except:
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
|
|
||||||
from transformers import WarmupLinearSchedule
|
from transformers import get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from utils import logger
|
from utils import logger
|
||||||
from lm_seqs_dataset import LmSeqsDataset
|
from lm_seqs_dataset import LmSeqsDataset
|
||||||
@@ -137,9 +136,9 @@ class Distiller:
|
|||||||
betas=(0.9, 0.98))
|
betas=(0.9, 0.98))
|
||||||
|
|
||||||
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
|
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
|
||||||
self.scheduler = WarmupLinearSchedule(self.optimizer,
|
self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
|
||||||
warmup_steps=warmup_steps,
|
num_warmup_steps=warmup_steps,
|
||||||
t_total=num_train_optimization_steps)
|
num_training_steps=num_train_optimization_steps)
|
||||||
|
|
||||||
if self.fp16:
|
if self.fp16:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -3,4 +3,4 @@ tensorboard>=1.14.0
|
|||||||
tensorboardX==1.8
|
tensorboardX==1.8
|
||||||
psutil==5.6.3
|
psutil==5.6.3
|
||||||
scipy==1.3.1
|
scipy==1.3.1
|
||||||
transformers==2.0.0
|
transformers
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from ..utils_squad import (read_squad_examples, convert_examples_to_features,
|
from ..utils_squad import (read_squad_examples, convert_examples_to_features,
|
||||||
RawResult, write_predictions,
|
RawResult, write_predictions,
|
||||||
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
54
examples/pplm/README.md
Normal file
54
examples/pplm/README.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# Plug and Play Language Models: a Simple Approach to Controlled Text Generation
|
||||||
|
|
||||||
|
Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/)
|
||||||
|
|
||||||
|
This folder contains the original code used to run the Plug and Play Language Model (PPLM).
|
||||||
|
|
||||||
|
Paper link: https://arxiv.org/abs/1912.02164
|
||||||
|
|
||||||
|
Blog link: https://eng.uber.com/pplm
|
||||||
|
|
||||||
|
Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM
|
||||||
|
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/huggingface/transformers && cd transformers
|
||||||
|
pip install [--editable] .
|
||||||
|
pip install nltk torchtext # additional requirements.
|
||||||
|
cd examples/pplm
|
||||||
|
```
|
||||||
|
|
||||||
|
## PPLM-BoW
|
||||||
|
|
||||||
|
### Example command for bag-of-words control
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tuning hyperparameters for bag-of-words control
|
||||||
|
|
||||||
|
1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model.
|
||||||
|
|
||||||
|
2. If the language being generated is repetitive (e.g. "science science experiment experiment"), there are several options to consider: <br/>
|
||||||
|
a) Reduce the `--stepsize` <br/>
|
||||||
|
b) Increase `--kl_scale` (the KL-loss coefficient) or decrease `--gm_scale` (the gm-scaling term) <br/>
|
||||||
|
c) Add `--grad-length xx` where xx is an integer <= length (e.g. `--grad-length 30`).<br/>
|
||||||
|
|
||||||
|
|
||||||
|
## PPLM-Discrim
|
||||||
|
|
||||||
|
### Example command for discriminator based sentiment control
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tuning hyperparameters for discriminator control
|
||||||
|
|
||||||
|
1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model.
|
||||||
|
|
||||||
|
2. Use `--class_label 3` for negative, and `--class_label 2` for positive
|
||||||
|
|
||||||
BIN
examples/pplm/imgs/headfigure.png
Normal file
BIN
examples/pplm/imgs/headfigure.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 653 KiB |
BIN
examples/pplm/imgs/wooly.png
Normal file
BIN
examples/pplm/imgs/wooly.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 664 KiB |
18
examples/pplm/pplm_classification_head.py
Normal file
18
examples/pplm/pplm_classification_head.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
class ClassificationHead(torch.nn.Module):
    """Classification head for transformer encoders.

    A single linear layer mapping a hidden representation of size
    ``embed_size`` to ``class_size`` unnormalized class scores.

    Args:
        class_size: number of output classes.
        embed_size: size of the last dimension of the incoming hidden state.
    """

    def __init__(self, class_size, embed_size):
        super().__init__()
        self.class_size = class_size
        self.embed_size = embed_size
        # The attribute name is part of the state-dict keys
        # ("mlp.weight" / "mlp.bias"), so it must not be renamed.
        self.mlp = torch.nn.Linear(embed_size, class_size)

    def forward(self, hidden_state):
        """Return the class logits for ``hidden_state`` (no softmax applied)."""
        logits = self.mlp(hidden_state)
        return logits
|
||||||
879
examples/pplm/run_pplm.py
Normal file
879
examples/pplm/run_pplm.py
Normal file
@@ -0,0 +1,879 @@
|
|||||||
|
#! /usr/bin/env python3
|
||||||
|
# coding=utf-8
|
||||||
|
|
||||||
|
#Copyright (c) 2019 Uber Technologies, Inc.
|
||||||
|
#
|
||||||
|
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
#you may not use this file except in compliance with the License.
|
||||||
|
#You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
#http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
#Unless required by applicable law or agreed to in writing, software
|
||||||
|
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
#See the License for the specific language governing permissions and
|
||||||
|
#limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example command with bag of words:
|
||||||
|
python examples/pplm/run_pplm.py -B space --cond_text "The president" --length 100 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.01 --window_length 5 --kl_scale 0.01 --gm_scale 0.95
|
||||||
|
|
||||||
|
Example command with discriminator:
|
||||||
|
python examples/pplm/run_pplm.py -D sentiment --class_label 3 --cond_text "The lake" --length 10 --gamma 1.0 --num_iterations 30 --num_samples 10 --stepsize 0.01 --kl_scale 0.01 --gm_scale 0.95
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from operator import add
|
||||||
|
from typing import List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch.autograd import Variable
|
||||||
|
from tqdm import trange
|
||||||
|
|
||||||
|
from transformers import GPT2Tokenizer
|
||||||
|
from transformers.file_utils import cached_path
|
||||||
|
from transformers.modeling_gpt2 import GPT2LMHeadModel
|
||||||
|
from pplm_classification_head import ClassificationHead
|
||||||
|
|
||||||
|
# Loss-type selectors: bag-of-words only, discriminator only, or both.
PPLM_BOW = 1
PPLM_DISCRIM = 2
PPLM_BOW_DISCRIM = 3

# SMALL_CONST keeps probabilities / norms away from exact zero;
# BIG_CONST (negated) stands in for -infinity when masking logits.
SMALL_CONST = 1e-15
BIG_CONST = 1e10

# Common prefix of every hosted PPLM artefact.
_PPLM_S3_ROOT = "https://s3.amazonaws.com/models.huggingface.co/bert/pplm"

# Bag-of-words id -> URL of the word list for that topic.
BAG_OF_WORDS_ARCHIVE_MAP = {
    topic: "{}/bow/{}.txt".format(_PPLM_S3_ROOT, topic)
    for topic in (
        "legal",
        "military",
        "politics",
        "religion",
        "science",
        "space",
        "technology",
    )
}

# Discriminator id -> weights location and the metadata needed to rebuild
# the classification head and interpret its classes.
DISCRIMINATOR_MODELS_PARAMS = {
    "clickbait": {
        "url": _PPLM_S3_ROOT + "/discriminators/clickbait_classifier_head.pt",
        "class_size": 2,
        "embed_size": 1024,
        "class_vocab": {"non_clickbait": 0, "clickbait": 1},
        "default_class": 1,
        "pretrained_model": "gpt2-medium",
    },
    "sentiment": {
        "url": _PPLM_S3_ROOT + "/discriminators/SST_classifier_head.pt",
        "class_size": 5,
        "embed_size": 1024,
        "class_vocab": {"very_positive": 2, "very_negative": 3},
        "default_class": 3,
        "pretrained_model": "gpt2-medium",
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
def to_var(x, requires_grad=False, volatile=False, device='cuda'):
    """Move ``x`` to ``device`` and return it as a fresh autograd leaf.

    Args:
        x: tensor to convert.
        requires_grad: whether the returned tensor should track gradients.
        volatile: accepted for backward compatibility only and ignored; the
            flag was removed from PyTorch (use ``torch.no_grad()`` instead).
        device: target device. When ``'cuda'`` is requested but CUDA is not
            available the tensor is left on its current device.

    Returns:
        A tensor detached from any existing graph, sharing storage with the
        (possibly moved) input, with ``requires_grad`` set as requested.
    """
    if device == 'cuda':
        if torch.cuda.is_available():
            x = x.cuda()
    else:
        x = x.to(device)
    # detach() + requires_grad_() is the supported replacement for the
    # deprecated ``Variable(x, requires_grad=...)`` constructor.
    return x.detach().requires_grad_(requires_grad)
|
||||||
|
|
||||||
|
|
||||||
|
def top_k_filter(logits, k, probs=False):
    """
    Keep the k largest entries along the last dimension and mask the rest.

    Masked entries are set to -BIG_CONST (-1e10, a stand-in for -infinity)
    so that e^-infinity -> 0 won't contribute to the sum of the softmax
    denominator. When ``probs`` is True the input is treated as
    probabilities and masked entries are set to 0 instead.

    ``k == 0`` disables filtering and returns ``logits`` unchanged. A ``k``
    larger than the last dimension keeps every entry.
    """
    if k == 0:
        return logits

    # torch.topk raises when k exceeds the dimension size; clamp instead.
    k = min(k, logits.size(-1))
    # Smallest value that is still kept; shape (..., 1) so it broadcasts
    # against logits of any rank.
    kth_largest = torch.topk(logits, k)[0][..., -1:]
    fill_value = 0.0 if probs else -BIG_CONST
    return torch.where(
        logits < kth_largest,
        torch.full_like(logits, fill_value),
        logits,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def perturb_past(
        past,
        model,
        last,
        unpert_past=None,
        unpert_logits=None,
        accumulated_hidden=None,
        grad_norms=None,
        stepsize=0.01,
        one_hot_bows_vectors=None,
        classifier=None,
        class_label=None,
        loss_type=0,
        num_iterations=3,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        kl_scale=0.01,
        device='cuda',
):
    """Perturb ``past`` by gradient steps that lower the attribute loss.

    For ``num_iterations`` steps a perturbation (initially zero) is added to
    ``past``, the model is run on ``last``, a loss is built from the enabled
    terms (bag of words, discriminator, KL to the unperturbed distribution)
    and the normalized, windowed gradient is accumulated into the
    perturbation.

    Args:
        past: sequence of 5-D key/value tensors, presumably laid out as
            (2, batch, heads, seq_len, head_dim) -- TODO confirm.
        model: callable as ``model(last, past=...)`` and
            ``model(past=..., inputs_embeds=...)``, returning
            ``(logits, past, all_hidden)``.
        last: the most recent token ids fed to the model.
        unpert_past: past of the unperturbed run (discriminator loss only).
        unpert_logits: logits of the unperturbed run (KL term only).
        accumulated_hidden: sum of earlier hidden states; ``None`` means 0.
        grad_norms: running gradient norms from earlier calls, or ``None``.
        stepsize: gradient step size.
        one_hot_bows_vectors: one-hot matrices, one per bag of words.
        classifier: attribute classifier applied to the mean hidden state.
        class_label: target class id for the discriminator loss.
        loss_type: one of PPLM_BOW, PPLM_DISCRIM, PPLM_BOW_DISCRIM.
        num_iterations: number of gradient steps.
        horizon_length: number of future steps rolled out for the
            discriminator loss.
        window_length: number of sequence positions that receive gradient;
            0 means every position.
        decay: scale the window linearly from 1/window_length up to 1.
            Has no effect when ``window_length`` is 0.
        gamma: exponent applied to the gradient norm when normalizing.
        kl_scale: weight of the KL term; 0 disables it.
        device: device on which masks and perturbations are created.

    Returns:
        ``(pert_past, new_accumulated_hidden, grad_norms, loss_per_iter)``.
        ``new_accumulated_hidden`` is ``None`` when ``num_iterations`` is 0.
    """
    # Start from a zero perturbation, kept as numpy arrays between steps.
    grad_accumulator = [np.zeros(p.shape, dtype="float32") for p in past]

    if accumulated_hidden is None:
        accumulated_hidden = 0

    # The decay ramp is only meaningful (and only used) with a finite window;
    # without this guard window_length == 0 would divide by zero.
    if decay and window_length > 0:
        decay_mask = torch.arange(
            0.,
            1.0 + SMALL_CONST,
            1.0 / window_length
        )[1:]
    else:
        decay_mask = 1.0

    # Build the mask that restricts the gradient to a window of positions.
    # NOTE(review): the ones cover the leading window_length positions of
    # dim -2, not the trailing ones -- confirm this is the intended window.
    _, _, _, curr_length, _ = past[0].shape

    if curr_length > window_length and window_length > 0:
        leading_dims = tuple(past[0].shape[:-2])
        trailing_dim = tuple(past[0].shape[-1:])
        ones_key_val_shape = leading_dims + (window_length,) + trailing_dim
        zeros_key_val_shape = (
            leading_dims + (curr_length - window_length,) + trailing_dim
        )

        # Move the sequence axis last so the 1-D decay ramp broadcasts over
        # it, then move it back.
        ones_mask = torch.ones(ones_key_val_shape)
        ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3)
        ones_mask = ones_mask.permute(0, 1, 2, 4, 3)

        window_mask = torch.cat(
            (ones_mask, torch.zeros(zeros_key_val_shape)),
            dim=-2
        ).to(device)
    else:
        window_mask = torch.ones_like(past[0]).to(device)

    # accumulate perturbations for num_iterations
    loss_per_iter = []
    new_accumulated_hidden = None
    for i in range(num_iterations):
        print("Iteration ", i + 1)
        curr_perturbation = [
            to_var(torch.from_numpy(p_), requires_grad=True, device=device)
            for p_ in grad_accumulator
        ]

        # Compute hidden using perturbed past
        perturbed_past = list(map(add, past, curr_perturbation))
        _, _, _, curr_length, _ = curr_perturbation[0].shape
        all_logits, _, all_hidden = model(last, past=perturbed_past)
        hidden = all_hidden[-1]
        new_accumulated_hidden = accumulated_hidden + torch.sum(
            hidden,
            dim=1
        ).detach()
        # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
        logits = all_logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)

        loss = 0.0
        if loss_type in (PPLM_BOW, PPLM_BOW_DISCRIM):
            for one_hot_bow in one_hot_bows_vectors:
                bow_logits = torch.mm(probs, torch.t(one_hot_bow))
                bow_loss = -torch.log(torch.sum(bow_logits))
                loss += bow_loss
            print(" pplm_bow_loss:", loss.data.cpu().numpy())

        if loss_type in (PPLM_DISCRIM, PPLM_BOW_DISCRIM):
            ce_loss = torch.nn.CrossEntropyLoss()
            # Work on a local name: it is rebound on every horizon step and
            # unpert_past must stay intact for the next iteration.
            curr_unpert_past = unpert_past
            curr_probs = torch.unsqueeze(probs, dim=1)
            wte = model.resize_token_embeddings()
            for _ in range(horizon_length):
                # Feed the expected embedding under the current distribution.
                inputs_embeds = torch.matmul(curr_probs, wte.weight.data)
                _, curr_unpert_past, curr_all_hidden = model(
                    past=curr_unpert_past,
                    inputs_embeds=inputs_embeds
                )
                curr_hidden = curr_all_hidden[-1]
                new_accumulated_hidden = new_accumulated_hidden + torch.sum(
                    curr_hidden, dim=1)

            prediction = classifier(new_accumulated_hidden /
                                    (curr_length + 1 + horizon_length))

            label = torch.tensor(prediction.shape[0] * [class_label],
                                 device=device,
                                 dtype=torch.long)
            discrim_loss = ce_loss(prediction, label)
            print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy())
            loss += discrim_loss

        kl_loss = 0.0
        if kl_scale > 0.0:
            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
            # Lift (near-)zero probabilities so the ratio and log stay finite.
            unpert_probs = (
                unpert_probs + SMALL_CONST *
                (unpert_probs <= SMALL_CONST).float().to(device).detach()
            )
            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(
                device).detach()
            corrected_probs = probs + correction.detach()
            kl_loss = kl_scale * (
                (corrected_probs * (corrected_probs / unpert_probs).log()).sum()
            )
            print(' kl_loss', kl_loss.data.cpu().numpy())
            loss += kl_loss

        loss_per_iter.append(loss.data.cpu().numpy())
        print(' pplm_loss', (loss - kl_loss).data.cpu().numpy())

        # compute gradients
        loss.backward()

        # calculate gradient norms
        if grad_norms is not None and loss_type == PPLM_BOW:
            # Keep the largest norm seen so far for each tensor.
            grad_norms = [
                torch.max(prev_norm, torch.norm(p_.grad * window_mask))
                for prev_norm, p_ in zip(grad_norms, curr_perturbation)
            ]
        else:
            grad_norms = [
                (torch.norm(p_.grad * window_mask) + SMALL_CONST)
                for p_ in curr_perturbation
            ]

        # normalize gradients
        grad = [
            -stepsize *
            (p_.grad * window_mask / norm ** gamma).data.cpu().numpy()
            for p_, norm in zip(curr_perturbation, grad_norms)
        ]

        # accumulate gradient
        grad_accumulator = list(map(add, grad, grad_accumulator))

        # reset gradients, just to make sure
        for p_ in curr_perturbation:
            p_.grad.data.zero_()

        # removing past from the graph
        past = [p_.detach() for p_ in past]

    # apply the accumulated perturbations to the past
    grad_accumulator = [
        to_var(torch.from_numpy(p_), requires_grad=True, device=device)
        for p_ in grad_accumulator
    ]
    pert_past = list(map(add, past, grad_accumulator))

    return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter
|
||||||
|
|
||||||
|
|
||||||
|
def get_classifier(
        name: Optional[str], class_label: Union[str, int],
        device: str
) -> "Tuple[Optional[ClassificationHead], Optional[int]]":
    """Load a discriminator head and resolve the target class id.

    Args:
        name: key into DISCRIMINATOR_MODELS_PARAMS, or ``None`` to disable
            the discriminator.
        class_label: class name (looked up in the model's ``class_vocab``)
            or class id (must be one of the ``class_vocab`` values).
        device: device the classifier is moved to and loaded onto.

    Returns:
        ``(classifier, label_id)``, or ``(None, None)`` when ``name`` is
        ``None``. An unknown ``class_label`` falls back to the model's
        ``default_class`` and a notice is printed; a ``class_label`` that is
        neither ``str`` nor ``int`` falls back silently.

    Raises:
        ValueError: if the model parameters contain neither "url" nor "path".
    """
    if name is None:
        return None, None

    params = DISCRIMINATOR_MODELS_PARAMS[name]
    classifier = ClassificationHead(
        class_size=params['class_size'],
        embed_size=params['embed_size']
    ).to(device)
    if "url" in params:
        resolved_archive_file = cached_path(params["url"])
    elif "path" in params:
        resolved_archive_file = params["path"]
    else:
        raise ValueError("Either url or path have to be specified "
                         "in the discriminator model parameters")
    # NOTE: torch.load unpickles the file; only point "url"/"path" at
    # weights from a trusted source.
    classifier.load_state_dict(
        torch.load(resolved_archive_file, map_location=device))
    classifier.eval()

    class_vocab = params["class_vocab"]
    if isinstance(class_label, str):
        label_id = class_vocab.get(class_label)
    elif isinstance(class_label, int):
        label_id = class_label if class_label in class_vocab.values() else None
    else:
        # Neither a name nor an id: use the default without a notice.
        return classifier, params["default_class"]

    if label_id is None:
        label_id = params["default_class"]
        print("class_label {} not in class_vocab".format(class_label))
        print("available values are: {}".format(class_vocab))
        print("using default class {}".format(label_id))

    return classifier, label_id
|
||||||
|
|
||||||
|
|
||||||
|
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \
        List[List[List[int]]]:
    """Tokenize every word of each requested bag of words.

    Args:
        bag_of_words_ids_or_paths: for each bag either a key of
            BAG_OF_WORDS_ARCHIVE_MAP (resolved through ``cached_path``) or a
            path to a text file with one word per line.
        tokenizer: object whose ``encode(text, add_prefix_space=True)``
            returns a list of token ids.

    Returns:
        One list per bag, holding one token-id list per word. Blank lines
        are skipped so they are not tokenized as if they were words.
    """
    bow_indices = []
    for id_or_path in bag_of_words_ids_or_paths:
        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
        else:
            filepath = id_or_path
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(filepath, "r", encoding="utf-8") as f:
            words = [line.strip() for line in f.read().splitlines()]
        bow_indices.append(
            [tokenizer.encode(word, add_prefix_space=True)
             for word in words if word])
    return bow_indices
|
||||||
|
|
||||||
|
|
||||||
|
def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):
    """Turn tokenized bags of words into one-hot matrices.

    Only words made of exactly one token are used; multi-token words and
    empty token lists are dropped.

    Args:
        bow_indices: list of bags, each a list of token-id lists, or ``None``.
        tokenizer: object exposing ``vocab_size``.
        device: device on which the matrices are created.

    Returns:
        ``None`` when ``bow_indices`` is ``None``; otherwise one float tensor
        of shape (num_single_token_words, vocab_size) per bag.

    Raises:
        ValueError: if a bag contains no single-token word.
    """
    if bow_indices is None:
        return None

    one_hot_bows_vectors = []
    for single_bow in bow_indices:
        # ``== 1`` rather than ``<= 1``: an empty id list would make the
        # tensor below ragged.
        single_token_words = [ids for ids in single_bow if len(ids) == 1]
        if not single_token_words:
            raise ValueError(
                "Bag of words contains no single-token words")
        word_ids = torch.tensor(
            single_token_words, dtype=torch.long, device=device)
        one_hot_bow = torch.zeros(
            word_ids.shape[0], tokenizer.vocab_size, device=device)
        # word_ids has shape (num_words, 1): one column index per row.
        one_hot_bow.scatter_(1, word_ids, 1)
        one_hot_bows_vectors.append(one_hot_bow)
    return one_hot_bows_vectors
|
||||||
|
|
||||||
|
|
||||||
|
def full_text_generation(
        model,
        tokenizer,
        context=None,
        num_samples=1,
        device="cuda",
        bag_of_words=None,
        discrim=None,
        class_label=None,
        length=100,
        stepsize=0.02,
        temperature=1.0,
        top_k=10,
        sample=False,
        num_iterations=3,
        grad_length=10000,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        gm_scale=0.9,
        kl_scale=0.01,
        **kwargs
):
    """Generate one unperturbed text and ``num_samples`` perturbed texts.

    The attribute model is chosen from ``bag_of_words`` (a ";"-separated
    list of bag ids or paths) and/or ``discrim`` (a discriminator id); at
    least one of them must be given. Extra keyword arguments are accepted
    and ignored.

    Returns:
        ``(unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses,
        losses_in_time)``. ``discrim_losses`` is only filled when a
        discriminator is used.

    Raises:
        Exception: if neither a bag of words nor a discriminator is given.
    """
    classifier, class_id = get_classifier(discrim, class_label, device)

    bow_indices = (
        get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
        if bag_of_words else []
    )

    # Select the loss according to which attribute models are active.
    if not bag_of_words and classifier is None:
        raise Exception("Specify either a bag of words or a discriminator")
    if not bag_of_words:
        loss_type = PPLM_DISCRIM
        print("Using PPLM-Discrim")
    elif classifier:
        print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.")
        loss_type = PPLM_BOW_DISCRIM
    else:
        loss_type = PPLM_BOW
        print("Using PPLM-BoW")

    # Arguments common to the unperturbed and the perturbed runs.
    shared_args = dict(
        model=model,
        tokenizer=tokenizer,
        context=context,
        device=device,
        length=length,
        sample=sample,
    )

    unpert_gen_tok_text, _, _ = generate_text_pplm(perturb=False, **shared_args)
    if device == 'cuda':
        torch.cuda.empty_cache()

    pert_gen_tok_texts, discrim_losses, losses_in_time = [], [], []
    for _ in range(num_samples):
        pert_tokens, sample_discrim_loss, sample_losses = generate_text_pplm(
            perturb=True,
            bow_indices=bow_indices,
            classifier=classifier,
            class_label=class_id,
            loss_type=loss_type,
            stepsize=stepsize,
            temperature=temperature,
            top_k=top_k,
            num_iterations=num_iterations,
            grad_length=grad_length,
            horizon_length=horizon_length,
            window_length=window_length,
            decay=decay,
            gamma=gamma,
            gm_scale=gm_scale,
            kl_scale=kl_scale,
            **shared_args
        )
        pert_gen_tok_texts.append(pert_tokens)
        if classifier is not None:
            discrim_losses.append(sample_discrim_loss.data.cpu().numpy())
        losses_in_time.append(sample_losses)

    if device == 'cuda':
        torch.cuda.empty_cache()

    return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
|
||||||
|
|
||||||
|
|
||||||
|
def generate_text_pplm(
        model,
        tokenizer,
        context=None,
        past=None,
        device="cuda",
        perturb=True,
        bow_indices=None,
        classifier=None,
        class_label=None,
        loss_type=0,
        length=100,
        stepsize=0.02,
        temperature=1.0,
        top_k=10,
        sample=False,
        num_iterations=3,
        grad_length=10000,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        gm_scale=0.9,
        kl_scale=0.01,
):
    """Generate ``length`` tokens, optionally steering with a perturbed past.

    On every step the model is run unperturbed on the whole output so far;
    when ``perturb`` is set (and ``num_iterations`` is non-zero) the cached
    past is modified by ``perturb_past`` before the next token is predicted,
    and the perturbed and unperturbed distributions are fused with weight
    ``gm_scale``. The next token is sampled when ``sample`` is true and
    chosen greedily otherwise. The decoded output is printed at each step.

    Returns:
        ``(output_so_far, unpert_discrim_loss, loss_in_time)``: the generated
        token ids (context included), the last unperturbed discriminator
        loss (0 without a classifier) and the per-step losses reported by
        ``perturb_past``.
    """
    output_so_far = None
    if context:
        context_t = torch.tensor(context, device=device, dtype=torch.long)
        # Make sure there is a leading batch dimension.
        while len(context_t.shape) < 2:
            context_t = context_t.unsqueeze(0)
        output_so_far = context_t

    # collect one hot vectors for bags of words
    one_hot_bows_vectors = build_bows_one_hot_vectors(
        bow_indices, tokenizer, device)

    grad_norms = None
    last = None
    unpert_discrim_loss = 0
    loss_in_time = []
    for step in trange(length, ascii=True):

        # The model takes the cached past plus the current token, so on the
        # first step split the output into "everything but the last token"
        # (turned into the past) and the last token.
        if past is None and output_so_far is not None:
            last = output_so_far[:, -1:]
            if output_so_far.shape[1] > 1:
                _, past, _ = model(output_so_far[:, :-1])

        # Unperturbed forward pass over the full output.
        unpert_logits, unpert_past, unpert_all_hidden = model(output_so_far)
        unpert_last_hidden = unpert_all_hidden[-1]

        # Beyond grad_length steps the perturbation step size drops to zero.
        current_stepsize = stepsize * 0 if step >= grad_length else stepsize

        # modify the past if necessary
        if perturb and num_iterations != 0 and past is not None:
            accumulated_hidden = torch.sum(
                unpert_last_hidden[:, :-1, :], dim=1)
            pert_past, _, grad_norms, loss_this_iter = perturb_past(
                past,
                model,
                last,
                unpert_past=unpert_past,
                unpert_logits=unpert_logits,
                accumulated_hidden=accumulated_hidden,
                grad_norms=grad_norms,
                stepsize=current_stepsize,
                one_hot_bows_vectors=one_hot_bows_vectors,
                classifier=classifier,
                class_label=class_label,
                loss_type=loss_type,
                num_iterations=num_iterations,
                horizon_length=horizon_length,
                window_length=window_length,
                decay=decay,
                gamma=gamma,
                kl_scale=kl_scale,
                device=device,
            )
            loss_in_time.append(loss_this_iter)
        else:
            pert_past = past

        pert_logits, past, _ = model(last, past=pert_past)
        pert_logits = pert_logits[:, -1, :] / temperature  # + SMALL_CONST
        pert_probs = F.softmax(pert_logits, dim=-1)

        if classifier is None:
            unpert_discrim_loss = 0
        else:
            ce_loss = torch.nn.CrossEntropyLoss()
            prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
            label = torch.tensor([class_label], device=device,
                                 dtype=torch.long)
            unpert_discrim_loss = ce_loss(prediction, label)
            print(
                "unperturbed discrim loss",
                unpert_discrim_loss.data.cpu().numpy()
            )

        # Fuse the modified model and original model
        if perturb:
            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)

            fused_probs = (pert_probs ** gm_scale) * (
                unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
            pert_probs = top_k_filter(fused_probs, k=top_k,
                                      probs=True)  # + SMALL_CONST

            # rescale
            if torch.sum(pert_probs) <= 1:
                pert_probs = pert_probs / torch.sum(pert_probs)
        else:
            pert_logits = top_k_filter(pert_logits, k=top_k)  # + SMALL_CONST
            pert_probs = F.softmax(pert_logits, dim=-1)

        # sample or greedy
        if sample:
            last = torch.multinomial(pert_probs, num_samples=1)
        else:
            _, last = torch.topk(pert_probs, k=1, dim=-1)

        # update context/output_so_far appending the new token
        if output_so_far is None:
            output_so_far = last
        else:
            output_so_far = torch.cat((output_so_far, last), dim=1)

        print(tokenizer.decode(output_so_far.tolist()[0]))

    return output_so_far, unpert_discrim_loss, loss_in_time
|
||||||
|
|
||||||
|
|
||||||
|
def set_generic_model_params(discrim_weights, discrim_meta):
    """Register a user-supplied discriminator under the "generic" key.

    Reads the JSON metadata file ``discrim_meta``, stores ``discrim_weights``
    in it under "path" and installs the result as
    ``DISCRIMINATOR_MODELS_PARAMS['generic']``.

    Raises:
        ValueError: if ``discrim_weights`` or ``discrim_meta`` is ``None``.
    """
    required = (
        ('discrim_weights', discrim_weights),
        ('discrim_meta', discrim_meta),
    )
    for arg_name, arg_value in required:
        if arg_value is None:
            raise ValueError(
                'When using a generic discriminator, '
                '{} need to be specified'.format(arg_name)
            )

    with open(discrim_meta, 'r') as meta_file:
        generic_params = json.load(meta_file)
    generic_params['path'] = discrim_weights
    DISCRIMINATOR_MODELS_PARAMS['generic'] = generic_params
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_with_highlights(tokenizer, token_ids, highlight_ids, color_module):
    """Decode ``token_ids`` one by one, wrapping ids in ``highlight_ids`` in red."""
    pieces = []
    for word_id in token_ids:
        piece = tokenizer.decode([word_id])
        if word_id in highlight_ids:
            piece = '{}{}{}'.format(
                color_module.Fore.RED,
                piece,
                color_module.Style.RESET_ALL
            )
        pieces.append(piece)
    return ''.join(pieces)


def run_pplm_example(
        pretrained_model="gpt2-medium",
        cond_text="",
        uncond=False,
        num_samples=1,
        bag_of_words=None,
        discrim=None,
        discrim_weights=None,
        discrim_meta=None,
        class_label=-1,
        length=100,
        stepsize=0.02,
        temperature=1.0,
        top_k=10,
        sample=False,
        num_iterations=3,
        grad_length=10000,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        gm_scale=0.9,
        kl_scale=0.01,
        seed=0,
        no_cuda=False,
        colorama=False
):
    """Run PPLM end to end and print the generated texts.

    Seeds the RNGs, loads the GPT-2 model and tokenizer (the discriminator's
    own ``pretrained_model`` overrides the argument when ``discrim`` is
    given), builds the conditioning prefix, generates one unperturbed and
    ``num_samples`` perturbed continuations and prints them. Prompts on
    stdin for a prefix when ``cond_text`` is empty and ``uncond`` is false.

    With ``colorama`` set, bag-of-words tokens in the perturbed texts are
    printed in red; if the ``colorama`` package cannot be imported the texts
    are printed without colors.

    Returns:
        None.
    """
    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"
        ]
        print("discrim = {}, pretrained_model set "
              "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(
        pretrained_model,
        output_hidden_states=True
    )
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode(
            [tokenizer.bos_token]
        )
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts

    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
    )

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    print("=" * 80)
    print("= Unperturbed generated text =")
    print(unpert_gen_text)
    print()

    # Import the coloring package once, under a name that does not rebind
    # the boolean ``colorama`` parameter.
    color_module = None
    if colorama:
        try:
            import colorama as color_module
        except ImportError:
            print("colorama is not installed; printing without colors")

    bow_word_ids = set()
    if bag_of_words and color_module is not None:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # Only single-token words can be matched against generated ids;
            # ``== 1`` also guards the ids[0] access against empty lists.
            bow_word_ids.update(
                ids[0] for ids in single_bow_list if len(ids) == 1
            )

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize perturbed text
            token_ids = pert_gen_tok_text.tolist()[0]
            if color_module is not None:
                pert_gen_text = _decode_with_highlights(
                    tokenizer, token_ids, bow_word_ids, color_module)
            else:
                pert_gen_text = tokenizer.decode(token_ids)

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except Exception as err:
            # Printing stays best effort, but a failure is reported instead
            # of being silently dropped.
            print("Could not print perturbed text {}: {}".format(i + 1, err))

    return
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line entry point: every option maps one-to-one onto a keyword
# argument of run_pplm_example.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model",
        "-M",
        type=str,
        default="gpt2-medium",
        help="pretrained model name or path to local checkpoint",
    )
    parser.add_argument(
        "--cond_text", type=str, default="The lake",
        help="Prefix texts to condition on"
    )
    parser.add_argument(
        "--uncond", action="store_true",
        help="Generate from end-of-text as prefix"
    )
    parser.add_argument(
        "--num_samples",
        type=int,
        default=1,
        help="Number of samples to generate from the modified latents",
    )
    parser.add_argument(
        "--bag_of_words",
        "-B",
        type=str,
        default=None,
        help="Bags of words used for PPLM-BoW. "
             "Either a BOW id (see list in code) or a filepath. "
             "Multiple BoWs separated by ;",
    )
    # NOTE(review): "toxicity" is accepted here but has no entry in
    # DISCRIMINATOR_MODELS_PARAMS, so selecting it fails with a KeyError.
    parser.add_argument(
        "--discrim",
        "-D",
        type=str,
        default=None,
        choices=("clickbait", "sentiment", "toxicity", "generic"),
        help="Discriminator to use",
    )
    parser.add_argument('--discrim_weights', type=str, default=None,
                        help='Weights for the generic discriminator')
    parser.add_argument('--discrim_meta', type=str, default=None,
                        help='Meta information for the generic discriminator')
    parser.add_argument(
        "--class_label",
        type=int,
        default=-1,
        help="Class label used for the discriminator",
    )
    parser.add_argument("--length", type=int, default=100)
    parser.add_argument("--stepsize", type=float, default=0.02)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=10)
    parser.add_argument(
        "--sample", action="store_true",
        help="Sample the next token instead of picking the most likely one"
    )
    parser.add_argument("--num_iterations", type=int, default=3)
    parser.add_argument("--grad_length", type=int, default=10000)
    parser.add_argument(
        "--window_length",
        type=int,
        default=0,
        help="Length of past which is being optimized; "
             "0 corresponds to infinite window length",
    )
    parser.add_argument(
        "--horizon_length",
        type=int,
        default=1,
        help="Length of future to optimize over",
    )
    parser.add_argument("--decay", action="store_true",
                        help="whether to decay or not")
    parser.add_argument("--gamma", type=float, default=1.5)
    parser.add_argument("--gm_scale", type=float, default=0.9)
    parser.add_argument("--kl_scale", type=float, default=0.01)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--no_cuda", action="store_true", help="no cuda")
    parser.add_argument("--colorama", action="store_true",
                        help="colors keywords")

    args = parser.parse_args()
    run_pplm_example(**vars(args))
|
||||||
588
examples/pplm/run_pplm_discrim_train.py
Normal file
588
examples/pplm/run_pplm_discrim_train.py
Normal file
@@ -0,0 +1,588 @@
|
|||||||
|
#! /usr/bin/env python3
|
||||||
|
# coding=utf-8
|
||||||
|
|
||||||
|
#Copyright (c) 2019 Uber Technologies, Inc.
|
||||||
|
#
|
||||||
|
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
#you may not use this file except in compliance with the License.
|
||||||
|
#You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
#http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
#Unless required by applicable law or agreed to in writing, software
|
||||||
|
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
#See the License for the specific language governing permissions and
|
||||||
|
#limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import torch.optim
|
||||||
|
import torch.optim as optim
|
||||||
|
import torch.utils.data as data
|
||||||
|
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
||||||
|
from torchtext import data as torchtext_data
|
||||||
|
from torchtext import datasets
|
||||||
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
|
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||||
|
from pplm_classification_head import ClassificationHead
|
||||||
|
|
||||||
|
# Seed the torch and NumPy random number generators when the module is loaded.
torch.manual_seed(0)
np.random.seed(0)
# Small constant added to the denominator in Discriminator.avg_representation
# so the division is still defined when the mask sums to zero.
EPSILON = 1e-10
# Sentence passed to predict() after every training epoch as a sample prediction.
example_sentence = "This is incredible! I love it, this is the best chicken I have ever had."
# Length threshold in tokens (measured before the prefix token is added);
# clickbait / toxic / generic examples at or above it are skipped.
max_length_seq = 100
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Discriminator(torch.nn.Module):
    """Pretrained GPT-2 encoder topped with a trainable classification head.

    The encoder output is mean-pooled over the sequence dimension and fed to
    a ``ClassificationHead``; ``forward`` returns log-probabilities.
    """

    def __init__(
            self,
            class_size,
            pretrained_model="gpt2-medium",
            cached_mode=False,
            device='cpu'
    ):
        """Load the tokenizer and encoder and create the classification head.

        Args:
            class_size: number of output classes of the head.
            pretrained_model: name/path handed to ``from_pretrained``.
            cached_mode: when True, ``forward`` expects already-pooled
                representations instead of token ids.
            device: device that inputs and the padding mask are moved to.
        """
        super().__init__()
        self.cached_mode = cached_mode
        self.device = device

        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.transformer.config.hidden_size
        self.classifier_head = ClassificationHead(
            class_size=class_size, embed_size=self.embed_size
        )

    def get_classifier(self):
        """Return the classification head module."""
        return self.classifier_head

    def train_custom(self):
        """Freeze every encoder parameter and switch the head to train mode."""
        for weight in self.encoder.parameters():
            weight.requires_grad_(False)
        self.classifier_head.train()

    def avg_representation(self, x):
        """Mean-pool the encoder's hidden states over non-padding positions.

        Positions whose token id equals 0 are treated as padding (the value
        ``collate_fn`` pads with) and excluded from the average.
        NOTE(review): id 0 may also be a real vocabulary entry, in which case
        it is excluded too -- confirm against the tokenizer.

        Args:
            x: tensor of token ids; indexed here as (batch, sequence).

        Returns:
            One averaged hidden vector per batch row.
        """
        keep = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size)
        keep = keep.float().to(self.device).detach()

        hidden, _ = self.encoder.transformer(x)

        summed = torch.sum(hidden * keep, dim=1)
        # EPSILON keeps the division defined for rows that are all padding.
        counts = torch.sum(keep, dim=1).detach() + EPSILON
        return summed / counts

    def forward(self, x):
        """Return class log-probabilities for ``x``.

        In cached mode ``x`` is used directly as the pooled representation;
        otherwise it is run through ``avg_representation`` first.
        """
        x = x.to(self.device)
        pooled = x if self.cached_mode else self.avg_representation(x)
        return F.log_softmax(self.classifier_head(pooled), dim=-1)
|
||||||
|
|
||||||
|
|
||||||
|
class Dataset(data.Dataset):
    """In-memory dataset pairing each input in ``X`` with a label in ``y``."""

    def __init__(self, X, y):
        """Store the parallel collections of inputs (``X``) and labels (``y``)."""
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of stored inputs."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the example at ``index`` as a dict with keys "X" and "y"."""
        return {"X": self.X[index], "y": self.y[index]}
|
||||||
|
|
||||||
|
|
||||||
|
def collate_fn(data):
    """Collate token-id examples into a zero-padded batch.

    Args:
        data: non-empty list of dicts with a 1-D tensor under "X" and an
            integer label under "y".

    Returns:
        ``(x_batch, y_batch)``: a long tensor of shape
        (len(data), longest sequence) padded with 0, and a long tensor of
        the labels.
    """
    # Transpose the list of dicts into a dict of lists.
    columns = {key: [item[key] for item in data] for key in data[0].keys()}

    sequences = columns["X"]
    sizes = [len(seq) for seq in sequences]

    # torch.zeros => padding value = 0
    x_batch = torch.zeros(len(sequences), max(sizes)).long()
    for row, (seq, size) in enumerate(zip(sequences, sizes)):
        x_batch[row, :size] = seq[:size]

    y_batch = torch.tensor(columns["y"], dtype=torch.long)
    return x_batch, y_batch
|
||||||
|
|
||||||
|
|
||||||
|
def cached_collate_fn(data):
    """Collate pre-computed representations into a batch.

    Args:
        data: non-empty list of dicts with a tensor under "X" and an integer
            label under "y".

    Returns:
        ``(x_batch, y_batch)``: the "X" tensors concatenated along dim 0 and
        a long tensor of the labels.
    """
    # Transpose the list of dicts into a dict of lists.
    columns = {key: [item[key] for item in data] for key in data[0].keys()}
    return (
        torch.cat(columns["X"], 0),
        torch.tensor(columns["y"], dtype=torch.long),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def train_epoch(data_loader, discriminator, optimizer,
                epoch=0, log_interval=10, device='cpu'):
    """Run one training pass over ``data_loader``.

    Args:
        data_loader: yields ``(input, target)`` batches; ``data_loader.dataset``
            must support ``len`` (used for progress logging).
        discriminator: model exposing ``train_custom()`` and returning
            log-probabilities when called on a batch.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index; printed as ``epoch + 1``.
        log_interval: progress is printed every ``log_interval`` batches.
        device: device the batches are moved to.
    """
    samples_so_far = 0
    discriminator.train_custom()
    for batch_idx, (input_t, target_t) in enumerate(data_loader):
        input_t, target_t = input_t.to(device), target_t.to(device)

        optimizer.zero_grad()

        output_t = discriminator(input_t)
        loss = F.nll_loss(output_t, target_t)
        # A fresh graph is built for every batch and backpropagated exactly
        # once, so there is no reason to retain it after backward().
        loss.backward()
        optimizer.step()

        samples_so_far += len(input_t)

        if batch_idx % log_interval == 0:
            total = len(data_loader.dataset)
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch + 1,
                    samples_so_far, total,
                    100 * samples_so_far / total, loss.item()
                )
            )
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_performance(data_loader, discriminator, device='cpu'):
    """Print the average NLL loss and accuracy of ``discriminator``.

    The model is switched to eval mode and run without gradient tracking
    over every batch of ``data_loader``; nothing is returned.

    Args:
        data_loader: yields ``(input, target)`` batches; its ``dataset`` must
            support ``len``.
        discriminator: model returning log-probabilities for a batch.
        device: device the batches are moved to.
    """
    discriminator.eval()
    total_loss = 0
    num_correct = 0
    with torch.no_grad():
        for input_t, target_t in data_loader:
            input_t = input_t.to(device)
            target_t = target_t.to(device)
            log_probs = discriminator(input_t)
            # Accumulate the summed (not averaged) loss of the batch.
            total_loss += F.nll_loss(
                log_probs, target_t, reduction="sum"
            ).item()
            # Predicted class = index of the largest log-probability.
            predicted = log_probs.argmax(dim=1, keepdim=True)
            matches = predicted.eq(target_t.view_as(predicted))
            num_correct += matches.sum().item()

    num_samples = len(data_loader.dataset)
    total_loss /= num_samples

    print(
        "Performance on test set: "
        "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
            total_loss, num_correct, num_samples,
            100. * num_correct / num_samples
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
def predict(input_sentence, model, classes, cached=False, device='cpu'):
    """Classify one sentence and print the probability of every class.

    Args:
        input_sentence: raw text, encoded with ``model.tokenizer``.
        model: callable returning log-probabilities; must also provide
            ``tokenizer`` and, when ``cached`` is True, ``avg_representation``.
        classes: class names, in the order of the model's outputs.
        cached: when True the token ids are pooled with
            ``model.avg_representation`` before being passed to the model.
        device: device the encoded sentence is created on.
    """
    token_ids = model.tokenizer.encode(input_sentence)
    model_input = torch.tensor([token_ids], dtype=torch.long, device=device)
    if cached:
        model_input = model.avg_representation(model_input)

    log_probs = model(model_input).data.cpu().numpy().flatten().tolist()

    formatted = [
        "{}: {:.4f}".format(name, math.exp(value))
        for name, value in zip(classes, log_probs)
    ]
    print("Input sentence:", input_sentence)
    print("Predictions:", ", ".join(formatted))
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_data_loader(dataset, batch_size, discriminator,
                           shuffle=False, device='cpu'):
    """Pre-compute pooled representations and wrap them in a DataLoader.

    Every example of ``dataset`` is run once through
    ``discriminator.avg_representation`` (without gradient tracking); the
    resulting vectors are stored on the CPU together with their labels.

    Args:
        dataset: dataset of {"X": token ids, "y": label} items.
        batch_size: batch size for both the encoding pass and the returned
            loader.
        discriminator: model providing ``avg_representation``.
        shuffle: whether the returned loader shuffles its data.
        device: device the token batches are moved to for encoding.

    Returns:
        A DataLoader over the cached representations, collated with
        ``cached_collate_fn``.
    """
    token_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )

    cached_reps = []
    cached_labels = []
    for tokens, labels in tqdm(token_loader, ascii=True):
        with torch.no_grad():
            tokens = tokens.to(device)
            pooled = discriminator.avg_representation(tokens).cpu().detach()
            # Split the batch into per-example tensors with a leading dim of
            # 1, so cached_collate_fn can concatenate them again later.
            cached_reps.extend(torch.unbind(pooled.unsqueeze(1)))
            cached_labels.extend(labels.cpu().numpy().tolist())

    return torch.utils.data.DataLoader(
        dataset=Dataset(cached_reps, cached_labels),
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=cached_collate_fn,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_discriminator(class_size, pretrained_model, cached, device):
    """Create a Discriminator with ``class_size`` classes on ``device``."""
    return Discriminator(
        class_size=class_size,
        pretrained_model=pretrained_model,
        cached_mode=cached,
        device=device
    ).to(device)


def _with_prefix_token(token_ids, device):
    """Prepend token id 50256 and return the ids as a long tensor."""
    # NOTE(review): 50256 is presumably GPT-2's end-of-text id -- confirm
    # against the tokenizer if a different pretrained model is used.
    return torch.tensor([50256] + token_ids, device=device, dtype=torch.long)


def _encode_sst_split(split, tokenizer, class2idx, device):
    """Detokenize and encode every example of a torchtext SST split."""
    detokenizer = TreebankWordDetokenizer()
    x = []
    y = []
    for i in trange(len(split), ascii=True):
        example = vars(split[i])
        token_ids = tokenizer.encode(detokenizer.detokenize(example["text"]))
        x.append(_with_prefix_token(token_ids, device))
        y.append(class2idx[example["label"]])
    return Dataset(x, y)


def _parse_clickbait_line(line):
    """Return ``(text, label)`` for one line of the clickbait data file."""
    # SECURITY(review): eval() executes whatever the line contains. Only use
    # trusted data files; ast.literal_eval would be a safer parser if the
    # lines are plain literals.
    record = eval(line)
    return record["text"], record["label"]


def _parse_toxic_line(line):
    """Return ``(text, label)`` for one line of the toxic data file."""
    # SECURITY(review): same eval() caveat as in _parse_clickbait_line.
    record = eval(line)
    # The example is labelled 1 as soon as the labels sum to more than 0.
    return record["text"], int(np.sum(record["label"]) > 0)


def _collect_examples(records, tokenizer, device, parse_record, error_message):
    """Encode ``records`` into parallel lists of token tensors and labels.

    Empty records are skipped silently. Records that fail to parse or
    tokenize, or whose token count reaches ``max_length_seq``, are reported
    and skipped. Text and label are both resolved before anything is stored,
    so the two returned lists always stay aligned.
    """
    x = []
    y = []
    for i, record in enumerate(tqdm(records, ascii=True)):
        if not record:
            continue
        try:
            text, label = parse_record(record)
            token_ids = tokenizer.encode(text)
            if len(token_ids) >= max_length_seq:
                print("Line {} is longer than maximum length {}".format(
                    i, max_length_seq
                ))
                continue
            seq = _with_prefix_token(token_ids, device)
        except Exception:
            # Best effort: report and skip bad lines, but let
            # KeyboardInterrupt / SystemExit propagate.
            print(error_message.format(i))
            continue
        x.append(seq)
        y.append(label)
    return x, y


def _split_train_test(x, y):
    """Randomly split the examples into 90% train and 10% test datasets."""
    full_dataset = Dataset(x, y)
    train_size = int(0.9 * len(full_dataset))
    test_size = len(full_dataset) - train_size
    return torch.utils.data.random_split(
        full_dataset, [train_size, test_size]
    )


def train_discriminator(
        dataset, dataset_fp=None, pretrained_model="gpt2-medium",
        epochs=10, batch_size=64, log_interval=10,
        save_model=False, cached=False, no_cuda=False):
    """Train a classification head on top of a pretrained encoder.

    Args:
        dataset: "SST", "clickbait" or "toxic"; any other value is handled
            as a generic TSV dataset with rows ``class \\t text``.
        dataset_fp: path of the TSV file; required for the generic dataset.
        pretrained_model: name/path of the pretrained encoder.
        epochs: number of training epochs.
        batch_size: batch size for training and evaluation.
        log_interval: number of batches between training log lines.
        save_model: when True, write ``<dataset>_classifier_head_meta.json``
            and one ``<dataset>_classifier_head_epoch_<n>.pt`` per epoch to
            the working directory.
        cached: when True, pre-compute the encoder representations once and
            train the head on the cached vectors.
        no_cuda: when True, stay on the CPU even if CUDA is available.

    Raises:
        ValueError: if the generic dataset is selected without ``dataset_fp``.
    """
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    print("Preprocessing {} dataset...".format(dataset))
    start = time.time()

    if dataset == "SST":
        idx2class = ["positive", "negative", "very positive", "very negative",
                     "neutral"]
        class2idx = {c: i for i, c in enumerate(idx2class)}
        default_class = 2

        discriminator = _build_discriminator(
            len(idx2class), pretrained_model, cached, device
        )

        text = torchtext_data.Field()
        label = torchtext_data.Field(sequential=False)
        # The validation split is not used.
        train_data, _, test_data = datasets.SST.splits(
            text,
            label,
            fine_grained=True,
            train_subtrees=True,
        )

        train_dataset = _encode_sst_split(
            train_data, discriminator.tokenizer, class2idx, device
        )
        test_dataset = _encode_sst_split(
            test_data, discriminator.tokenizer, class2idx, device
        )

    elif dataset == "clickbait":
        idx2class = ["non_clickbait", "clickbait"]
        class2idx = {c: i for i, c in enumerate(idx2class)}
        default_class = 1

        discriminator = _build_discriminator(
            len(idx2class), pretrained_model, cached, device
        )

        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
            x, y = _collect_examples(
                f, discriminator.tokenizer, device, _parse_clickbait_line,
                "Error evaluating / tokenizing line {}, skipping it"
            )
        train_dataset, test_dataset = _split_train_test(x, y)

    elif dataset == "toxic":
        idx2class = ["non_toxic", "toxic"]
        class2idx = {c: i for i, c in enumerate(idx2class)}
        default_class = 0

        discriminator = _build_discriminator(
            len(idx2class), pretrained_model, cached, device
        )

        with open("datasets/toxic/toxic_train.txt") as f:
            x, y = _collect_examples(
                f, discriminator.tokenizer, device, _parse_toxic_line,
                "Error evaluating / tokenizing line {}, skipping it"
            )
        train_dataset, test_dataset = _split_train_test(x, y)

    else:  # if dataset == "generic":
        # This assumes the input dataset is a TSV with the following structure:
        # class \t text

        # `not dataset_fp` also rejects the empty string, not only None.
        if not dataset_fp:
            raise ValueError("When generic dataset is selected, "
                             "dataset_fp needs to be specified aswell.")

        # First pass: collect the class names to size the classifier.
        classes = set()
        with open(dataset_fp) as f:
            csv_reader = csv.reader(f, delimiter="\t")
            for row in tqdm(csv_reader, ascii=True):
                if row:
                    classes.add(row[0])

        idx2class = sorted(classes)
        class2idx = {c: i for i, c in enumerate(idx2class)}
        default_class = 0

        discriminator = _build_discriminator(
            len(idx2class), pretrained_model, cached, device
        )

        # Second pass: tokenize the text column.
        with open(dataset_fp) as f:
            csv_reader = csv.reader(f, delimiter="\t")
            x, y = _collect_examples(
                csv_reader, discriminator.tokenizer, device,
                lambda row: (row[1], class2idx[row[0]]),
                "Error tokenizing line {}, skipping it"
            )
        train_dataset, test_dataset = _split_train_test(x, y)

    discriminator_meta = {
        "class_size": len(idx2class),
        "embed_size": discriminator.embed_size,
        "pretrained_model": pretrained_model,
        "class_vocab": class2idx,
        "default_class": default_class,
    }

    end = time.time()
    print("Preprocessed {} data points".format(
        len(train_dataset) + len(test_dataset))
    )
    print("Data preprocessing took: {:.3f}s".format(end - start))

    if cached:
        print("Building representation cache...")

        start = time.time()

        train_loader = get_cached_data_loader(
            train_dataset, batch_size, discriminator,
            shuffle=True, device=device
        )

        test_loader = get_cached_data_loader(
            test_dataset, batch_size, discriminator, device=device
        )

        end = time.time()
        print("Building representation cache took: {:.3f}s".format(end - start))

    else:
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=collate_fn)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                  batch_size=batch_size,
                                                  collate_fn=collate_fn)

    if save_model:
        with open("{}_classifier_head_meta.json".format(dataset),
                  "w") as meta_file:
            json.dump(discriminator_meta, meta_file)

    optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)

    for epoch in range(epochs):
        start = time.time()
        print("\nEpoch", epoch + 1)

        train_epoch(
            discriminator=discriminator,
            data_loader=train_loader,
            optimizer=optimizer,
            epoch=epoch,
            log_interval=log_interval,
            device=device
        )
        evaluate_performance(
            data_loader=test_loader,
            discriminator=discriminator,
            device=device
        )

        end = time.time()
        print("Epoch took: {:.3f}s".format(end - start))

        print("\nExample prediction")
        predict(example_sentence, discriminator, idx2class,
                cached=cached, device=device)

        if save_model:
            # Only the classification head is saved, once per epoch.
            torch.save(discriminator.get_classifier().state_dict(),
                       "{}_classifier_head_epoch_{}.pt".format(dataset,
                                                               epoch + 1))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: every parsed option is forwarded to
    # train_discriminator as a keyword argument of the same name.
    parser = argparse.ArgumentParser(
        description="Train a discriminator on top of GPT-2 representations")
    parser.add_argument("--dataset", type=str, default="SST",
                        choices=("SST", "clickbait", "toxic", "generic"),
                        help="dataset to train the discriminator on. "
                             "In case of generic, the dataset is expected "
                             "to be a TSV file with structure: class \\t text")
    # The default is None (not "") so that train_discriminator's
    # "dataset_fp is None" check reports a missing path for the generic
    # dataset instead of failing later on open("").
    parser.add_argument("--dataset_fp", type=str, default=None,
                        help="File path of the dataset to use. "
                             "Needed only in case of generic dataset")
    parser.add_argument("--pretrained_model", type=str, default="gpt2-medium",
                        help="Pretrained model to use as encoder")
    parser.add_argument("--epochs", type=int, default=10, metavar="N",
                        help="Number of training epochs")
    parser.add_argument("--batch_size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--log_interval", type=int, default=10, metavar="N",
                        help="how many batches to wait before logging training status")
    parser.add_argument("--save_model", action="store_true",
                        help="whether to save the model")
    parser.add_argument("--cached", action="store_true",
                        help="whether to cache the input representations")
    parser.add_argument("--no_cuda", action="store_true",
                        help="use to turn off cuda")
    args = parser.parse_args()

    train_discriminator(**(vars(args)))
|
||||||
@@ -39,8 +39,9 @@ from transformers import (WEIGHTS_NAME,
|
|||||||
|
|
||||||
from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES
|
from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES
|
||||||
|
|
||||||
from utils_glue import (compute_metrics, convert_examples_to_features,
|
from transformers import glue_compute_metrics as compute_metrics
|
||||||
output_modes, processors)
|
from transformers import glue_output_modes as output_modes
|
||||||
|
from transformers import glue_processors as processors
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -233,6 +234,8 @@ def main():
|
|||||||
help="If > 0: limit the data to a subset of data_subset instances.")
|
help="If > 0: limit the data to a subset of data_subset instances.")
|
||||||
parser.add_argument("--overwrite_output_dir", action='store_true',
|
parser.add_argument("--overwrite_output_dir", action='store_true',
|
||||||
help="Whether to overwrite data in output directory")
|
help="Whether to overwrite data in output directory")
|
||||||
|
parser.add_argument('--overwrite_cache', action='store_true',
|
||||||
|
help="Overwrite the cached training and evaluation sets")
|
||||||
|
|
||||||
parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
|
parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
|
||||||
help="Don't normalize importance score by layers")
|
help="Don't normalize importance score by layers")
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import glob
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
import json
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@@ -47,9 +48,13 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig,
|
DistilBertConfig,
|
||||||
DistilBertForSequenceClassification,
|
DistilBertForSequenceClassification,
|
||||||
DistilBertTokenizer)
|
DistilBertTokenizer,
|
||||||
|
AlbertConfig,
|
||||||
|
AlbertForSequenceClassification,
|
||||||
|
AlbertTokenizer,
|
||||||
|
)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from transformers import glue_compute_metrics as compute_metrics
|
from transformers import glue_compute_metrics as compute_metrics
|
||||||
from transformers import glue_output_modes as output_modes
|
from transformers import glue_output_modes as output_modes
|
||||||
@@ -66,7 +71,8 @@ MODEL_CLASSES = {
|
|||||||
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
||||||
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
||||||
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
|
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
|
||||||
|
'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -99,8 +105,9 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
|
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
@@ -158,7 +165,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
loss.backward()
|
loss.backward()
|
||||||
|
|
||||||
tr_loss += loss.item()
|
tr_loss += loss.item()
|
||||||
if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
|
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
|
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
|
||||||
else:
|
else:
|
||||||
@@ -170,15 +177,23 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||||
# Log metrics
|
logs = {}
|
||||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||||
results = evaluate(args, model, tokenizer)
|
results = evaluate(args, model, tokenizer)
|
||||||
for key, value in results.items():
|
for key, value in results.items():
|
||||||
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
|
eval_key = 'eval_{}'.format(key)
|
||||||
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
|
logs[eval_key] = value
|
||||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
|
||||||
|
loss_scalar = (tr_loss - logging_loss) / args.logging_steps
|
||||||
|
learning_rate_scalar = scheduler.get_lr()[0]
|
||||||
|
logs['learning_rate'] = learning_rate_scalar
|
||||||
|
logs['loss'] = loss_scalar
|
||||||
logging_loss = tr_loss
|
logging_loss = tr_loss
|
||||||
|
|
||||||
|
for key, value in logs.items():
|
||||||
|
tb_writer.add_scalar(key, value, global_step)
|
||||||
|
print(json.dumps({**logs, **{'step': global_step}}))
|
||||||
|
|
||||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||||
# Save model checkpoint
|
# Save model checkpoint
|
||||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||||
@@ -189,11 +204,6 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||||
logger.info("Saving model checkpoint to %s", output_dir)
|
logger.info("Saving model checkpoint to %s", output_dir)
|
||||||
|
|
||||||
if args.tpu:
|
|
||||||
args.xla_model.optimizer_step(optimizer, barrier=True)
|
|
||||||
model.zero_grad()
|
|
||||||
global_step += 1
|
|
||||||
|
|
||||||
if args.max_steps > 0 and global_step > args.max_steps:
|
if args.max_steps > 0 and global_step > args.max_steps:
|
||||||
epoch_iterator.close()
|
epoch_iterator.close()
|
||||||
break
|
break
|
||||||
@@ -221,9 +231,13 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||||
# Note that DistributedSampler samples randomly
|
# Note that DistributedSampler samples randomly
|
||||||
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
|
eval_sampler = SequentialSampler(eval_dataset)
|
||||||
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
|
# multi-gpu eval
|
||||||
|
if args.n_gpu > 1:
|
||||||
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
# Eval!
|
# Eval!
|
||||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||||
logger.info(" Num examples = %d", len(eval_dataset))
|
logger.info(" Num examples = %d", len(eval_dataset))
|
||||||
@@ -318,7 +332,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||||
elif output_mode == "regression":
|
elif output_mode == "regression":
|
||||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
||||||
|
|
||||||
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
@@ -362,7 +376,7 @@ def main():
|
|||||||
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||||
help="Batch size per GPU/CPU for evaluation.")
|
help="Batch size per GPU/CPU for evaluation.")
|
||||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||||
help="The initial learning rate for Adam.")
|
help="The initial learning rate for Adam.")
|
||||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||||
@@ -393,15 +407,6 @@ def main():
|
|||||||
parser.add_argument('--seed', type=int, default=42,
|
parser.add_argument('--seed', type=int, default=42,
|
||||||
help="random seed for initialization")
|
help="random seed for initialization")
|
||||||
|
|
||||||
parser.add_argument('--tpu', action='store_true',
|
|
||||||
help="Whether to run on the TPU defined in the environment variables")
|
|
||||||
parser.add_argument('--tpu_ip_address', type=str, default='',
|
|
||||||
help="TPU IP address if none are set in the environment variables")
|
|
||||||
parser.add_argument('--tpu_name', type=str, default='',
|
|
||||||
help="TPU name if none are set in the environment variables")
|
|
||||||
parser.add_argument('--xrt_tpu_config', type=str, default='',
|
|
||||||
help="XRT TPU config if none are set in the environment variables")
|
|
||||||
|
|
||||||
parser.add_argument('--fp16', action='store_true',
|
parser.add_argument('--fp16', action='store_true',
|
||||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||||
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||||
@@ -435,23 +440,6 @@ def main():
|
|||||||
args.n_gpu = 1
|
args.n_gpu = 1
|
||||||
args.device = device
|
args.device = device
|
||||||
|
|
||||||
if args.tpu:
|
|
||||||
if args.tpu_ip_address:
|
|
||||||
os.environ["TPU_IP_ADDRESS"] = args.tpu_ip_address
|
|
||||||
if args.tpu_name:
|
|
||||||
os.environ["TPU_NAME"] = args.tpu_name
|
|
||||||
if args.xrt_tpu_config:
|
|
||||||
os.environ["XRT_TPU_CONFIG"] = args.xrt_tpu_config
|
|
||||||
|
|
||||||
assert "TPU_IP_ADDRESS" in os.environ
|
|
||||||
assert "TPU_NAME" in os.environ
|
|
||||||
assert "XRT_TPU_CONFIG" in os.environ
|
|
||||||
|
|
||||||
import torch_xla
|
|
||||||
import torch_xla.core.xla_model as xm
|
|
||||||
args.device = xm.xla_device()
|
|
||||||
args.xla_model = xm
|
|
||||||
|
|
||||||
# Setup logging
|
# Setup logging
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
@@ -505,7 +493,7 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
|
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
|
||||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and not args.tpu:
|
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||||
# Create output directory if needed
|
# Create output directory if needed
|
||||||
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||||
os.makedirs(args.output_dir)
|
os.makedirs(args.output_dir)
|
||||||
|
|||||||
@@ -42,12 +42,13 @@ except:
|
|||||||
|
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
|
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
|
||||||
BertConfig, BertForMaskedLM, BertTokenizer,
|
BertConfig, BertForMaskedLM, BertTokenizer,
|
||||||
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
||||||
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
||||||
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
|
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
|
||||||
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
|
||||||
|
CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -58,17 +59,18 @@ MODEL_CLASSES = {
|
|||||||
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||||
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
||||||
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
||||||
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
|
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
|
||||||
|
'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class TextDataset(Dataset):
|
class TextDataset(Dataset):
|
||||||
def __init__(self, tokenizer, file_path='train', block_size=512):
|
def __init__(self, tokenizer, args, file_path='train', block_size=512):
|
||||||
assert os.path.isfile(file_path)
|
assert os.path.isfile(file_path)
|
||||||
directory, filename = os.path.split(file_path)
|
directory, filename = os.path.split(file_path)
|
||||||
cached_features_file = os.path.join(directory, 'cached_lm_' + str(block_size) + '_' + filename)
|
cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename)
|
||||||
|
|
||||||
if os.path.exists(cached_features_file):
|
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
with open(cached_features_file, 'rb') as handle:
|
with open(cached_features_file, 'rb') as handle:
|
||||||
self.examples = pickle.load(handle)
|
self.examples = pickle.load(handle)
|
||||||
@@ -99,7 +101,7 @@ class TextDataset(Dataset):
|
|||||||
|
|
||||||
|
|
||||||
def load_and_cache_examples(args, tokenizer, evaluate=False):
|
def load_and_cache_examples(args, tokenizer, evaluate=False):
|
||||||
dataset = TextDataset(tokenizer, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
|
dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
@@ -185,7 +187,14 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
|
|
||||||
|
# Check if saved optimizer or scheduler states exist
|
||||||
|
if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
|
||||||
|
# Load in optimizer and scheduler states
|
||||||
|
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
|
||||||
|
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
|
||||||
|
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
@@ -214,13 +223,37 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
logger.info(" Total optimization steps = %d", t_total)
|
logger.info(" Total optimization steps = %d", t_total)
|
||||||
|
|
||||||
global_step = 0
|
global_step = 0
|
||||||
|
epochs_trained = 0
|
||||||
|
steps_trained_in_current_epoch = 0
|
||||||
|
# Check if continuing training from a checkpoint
|
||||||
|
if os.path.exists(args.model_name_or_path):
|
||||||
|
# set global_step to global_step of last saved checkpoint from model path
|
||||||
|
global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
|
||||||
|
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||||
|
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||||
|
|
||||||
|
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||||
|
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||||
|
logger.info(" Continuing training from global step %d", global_step)
|
||||||
|
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||||
|
|
||||||
tr_loss, logging_loss = 0.0, 0.0
|
tr_loss, logging_loss = 0.0, 0.0
|
||||||
|
|
||||||
|
model_to_resize = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||||
|
model_to_resize.resize_token_embeddings(len(tokenizer))
|
||||||
|
|
||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||||
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
|
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
|
||||||
for _ in train_iterator:
|
for _ in train_iterator:
|
||||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||||
for step, batch in enumerate(epoch_iterator):
|
for step, batch in enumerate(epoch_iterator):
|
||||||
|
|
||||||
|
# Skip past any already trained steps if resuming training
|
||||||
|
if steps_trained_in_current_epoch > 0:
|
||||||
|
steps_trained_in_current_epoch -= 1
|
||||||
|
continue
|
||||||
|
|
||||||
inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
|
inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
|
||||||
inputs = inputs.to(args.device)
|
inputs = inputs.to(args.device)
|
||||||
labels = labels.to(args.device)
|
labels = labels.to(args.device)
|
||||||
@@ -268,11 +301,17 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||||
model_to_save.save_pretrained(output_dir)
|
model_to_save.save_pretrained(output_dir)
|
||||||
|
tokenizer.save_pretrained(output_dir)
|
||||||
|
|
||||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||||
logger.info("Saving model checkpoint to %s", output_dir)
|
logger.info("Saving model checkpoint to %s", output_dir)
|
||||||
|
|
||||||
_rotate_checkpoints(args, checkpoint_prefix)
|
_rotate_checkpoints(args, checkpoint_prefix)
|
||||||
|
|
||||||
|
torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
|
||||||
|
torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
|
||||||
|
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||||
|
|
||||||
if args.max_steps > 0 and global_step > args.max_steps:
|
if args.max_steps > 0 and global_step > args.max_steps:
|
||||||
epoch_iterator.close()
|
epoch_iterator.close()
|
||||||
break
|
break
|
||||||
@@ -297,9 +336,13 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||||
# Note that DistributedSampler samples randomly
|
# Note that DistributedSampler samples randomly
|
||||||
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
|
eval_sampler = SequentialSampler(eval_dataset)
|
||||||
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
|
# multi-gpu evaluate
|
||||||
|
if args.n_gpu > 1:
|
||||||
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
# Eval!
|
# Eval!
|
||||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||||
logger.info(" Num examples = %d", len(eval_dataset))
|
logger.info(" Num examples = %d", len(eval_dataset))
|
||||||
@@ -427,7 +470,7 @@ def main():
|
|||||||
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
|
if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
|
||||||
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
||||||
"flag (masked language modeling).")
|
"flag (masked language modeling).")
|
||||||
if args.eval_data_file is None and args.do_eval:
|
if args.eval_data_file is None and args.do_eval:
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer, RobertaConfig,
|
XLNetTokenizer, RobertaConfig,
|
||||||
RobertaForMultipleChoice, RobertaTokenizer)
|
RobertaForMultipleChoice, RobertaTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from utils_multiple_choice import (convert_examples_to_features, processors)
|
from utils_multiple_choice import (convert_examples_to_features, processors)
|
||||||
|
|
||||||
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
@@ -226,9 +226,13 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
|
|||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||||
# Note that DistributedSampler samples randomly
|
# Note that DistributedSampler samples randomly
|
||||||
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
|
eval_sampler = SequentialSampler(eval_dataset)
|
||||||
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
|
# multi-gpu evaluate
|
||||||
|
if args.n_gpu > 1:
|
||||||
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
# Eval!
|
# Eval!
|
||||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||||
logger.info(" Num examples = %d", len(eval_dataset))
|
logger.info(" Num examples = %d", len(eval_dataset))
|
||||||
|
|||||||
@@ -33,19 +33,23 @@ from torch.utils.data.distributed import DistributedSampler
|
|||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
|
from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
|
||||||
from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
|
from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
|
||||||
|
from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
|
||||||
|
from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
ALL_MODELS = sum(
|
ALL_MODELS = sum(
|
||||||
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig)),
|
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
|
||||||
())
|
())
|
||||||
|
|
||||||
MODEL_CLASSES = {
|
MODEL_CLASSES = {
|
||||||
"bert": (BertConfig, BertForTokenClassification, BertTokenizer),
|
"bert": (BertConfig, BertForTokenClassification, BertTokenizer),
|
||||||
"roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer)
|
"roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
|
||||||
|
"distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
|
||||||
|
"camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -80,7 +84,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
|
|||||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
|
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
@@ -121,9 +125,10 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
|
|||||||
batch = tuple(t.to(args.device) for t in batch)
|
batch = tuple(t.to(args.device) for t in batch)
|
||||||
inputs = {"input_ids": batch[0],
|
inputs = {"input_ids": batch[0],
|
||||||
"attention_mask": batch[1],
|
"attention_mask": batch[1],
|
||||||
"token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
|
|
||||||
# XLM and RoBERTa don't use segment_ids
|
|
||||||
"labels": batch[3]}
|
"labels": batch[3]}
|
||||||
|
if args.model_type != "distilbert":
|
||||||
|
inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
|
||||||
|
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
||||||
|
|
||||||
@@ -191,6 +196,10 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
|
|||||||
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
|
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
|
||||||
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
|
# multi-gpu evaluate
|
||||||
|
if args.n_gpu > 1:
|
||||||
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
# Eval!
|
# Eval!
|
||||||
logger.info("***** Running evaluation %s *****", prefix)
|
logger.info("***** Running evaluation %s *****", prefix)
|
||||||
logger.info(" Num examples = %d", len(eval_dataset))
|
logger.info(" Num examples = %d", len(eval_dataset))
|
||||||
@@ -206,9 +215,9 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
|
|||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
inputs = {"input_ids": batch[0],
|
inputs = {"input_ids": batch[0],
|
||||||
"attention_mask": batch[1],
|
"attention_mask": batch[1],
|
||||||
"token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
|
|
||||||
# XLM and RoBERTa don't use segment_ids
|
|
||||||
"labels": batch[3]}
|
"labels": batch[3]}
|
||||||
|
if args.model_type != "distilbert":
|
||||||
|
inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
tmp_eval_loss, logits = outputs[:2]
|
tmp_eval_loss, logits = outputs[:2]
|
||||||
|
|
||||||
@@ -520,3 +529,4 @@ def main():
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,8 @@
|
|||||||
""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
|
""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
|
||||||
|
from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
@@ -23,11 +25,9 @@ import os
|
|||||||
import random
|
import random
|
||||||
import glob
|
import glob
|
||||||
import timeit
|
import timeit
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
|
||||||
TensorDataset)
|
|
||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -43,18 +43,12 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLMTokenizer, XLNetConfig,
|
XLMTokenizer, XLNetConfig,
|
||||||
XLNetForQuestionAnswering,
|
XLNetForQuestionAnswering,
|
||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
|
||||||
|
AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer,
|
||||||
|
XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
|
||||||
|
)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
|
||||||
|
|
||||||
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
|
||||||
RawResult, write_predictions,
|
|
||||||
RawResultExtended, write_predictions_extended)
|
|
||||||
|
|
||||||
# The following import is the official SQuAD evaluation script (2.0).
|
|
||||||
# You can remove it from the dependencies if you are using this script outside of the library
|
|
||||||
# We've added it here for automated tests (see examples/test_examples.py file)
|
|
||||||
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -65,7 +59,9 @@ MODEL_CLASSES = {
|
|||||||
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
||||||
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
||||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
||||||
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
|
||||||
|
'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
|
||||||
|
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer)
|
||||||
}
|
}
|
||||||
|
|
||||||
def set_seed(args):
|
def set_seed(args):
|
||||||
@@ -98,14 +94,16 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
optimizer_grouped_parameters = [
|
optimizer_grouped_parameters = [
|
||||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
|
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||||
|
|
||||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||||
|
|
||||||
# multi-gpu training (should be after apex fp16 initialization)
|
# multi-gpu training (should be after apex fp16 initialization)
|
||||||
@@ -128,25 +126,31 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||||
logger.info(" Total optimization steps = %d", t_total)
|
logger.info(" Total optimization steps = %d", t_total)
|
||||||
|
|
||||||
global_step = 0
|
global_step = 1
|
||||||
tr_loss, logging_loss = 0.0, 0.0
|
tr_loss, logging_loss = 0.0, 0.0
|
||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||||
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
||||||
|
|
||||||
for _ in train_iterator:
|
for _ in train_iterator:
|
||||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||||
for step, batch in enumerate(epoch_iterator):
|
for step, batch in enumerate(epoch_iterator):
|
||||||
model.train()
|
model.train()
|
||||||
batch = tuple(t.to(args.device) for t in batch)
|
batch = tuple(t.to(args.device) for t in batch)
|
||||||
inputs = {'input_ids': batch[0],
|
|
||||||
'attention_mask': batch[1],
|
inputs = {
|
||||||
'start_positions': batch[3],
|
'input_ids': batch[0],
|
||||||
'end_positions': batch[4]}
|
'attention_mask': batch[1],
|
||||||
|
'start_positions': batch[3],
|
||||||
|
'end_positions': batch[4]
|
||||||
|
}
|
||||||
|
|
||||||
if args.model_type != 'distilbert':
|
if args.model_type != 'distilbert':
|
||||||
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
|
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
|
||||||
|
|
||||||
if args.model_type in ['xlnet', 'xlm']:
|
if args.model_type in ['xlnet', 'xlm']:
|
||||||
inputs.update({'cls_index': batch[5],
|
inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
|
||||||
'p_mask': batch[6]})
|
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||||
|
|
||||||
@@ -173,8 +177,8 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
|
# Log metrics
|
||||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||||
# Log metrics
|
|
||||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||||
results = evaluate(args, model, tokenizer)
|
results = evaluate(args, model, tokenizer)
|
||||||
for key, value in results.items():
|
for key, value in results.items():
|
||||||
@@ -183,8 +187,8 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
||||||
logging_loss = tr_loss
|
logging_loss = tr_loss
|
||||||
|
|
||||||
|
# Save model checkpoint
|
||||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||||
# Save model checkpoint
|
|
||||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||||
if not os.path.exists(output_dir):
|
if not os.path.exists(output_dir):
|
||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
@@ -213,46 +217,72 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
os.makedirs(args.output_dir)
|
os.makedirs(args.output_dir)
|
||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||||
|
|
||||||
# Note that DistributedSampler samples randomly
|
# Note that DistributedSampler samples randomly
|
||||||
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
|
eval_sampler = SequentialSampler(dataset)
|
||||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||||
|
|
||||||
|
# multi-gpu evaluate
|
||||||
|
if args.n_gpu > 1:
|
||||||
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
# Eval!
|
# Eval!
|
||||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||||
logger.info(" Num examples = %d", len(dataset))
|
logger.info(" Num examples = %d", len(dataset))
|
||||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||||
|
|
||||||
all_results = []
|
all_results = []
|
||||||
start_time = timeit.default_timer()
|
start_time = timeit.default_timer()
|
||||||
|
|
||||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||||
model.eval()
|
model.eval()
|
||||||
batch = tuple(t.to(args.device) for t in batch)
|
batch = tuple(t.to(args.device) for t in batch)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
inputs = {'input_ids': batch[0],
|
inputs = {
|
||||||
'attention_mask': batch[1]
|
'input_ids': batch[0],
|
||||||
}
|
'attention_mask': batch[1]
|
||||||
|
}
|
||||||
|
|
||||||
if args.model_type != 'distilbert':
|
if args.model_type != 'distilbert':
|
||||||
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
|
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
|
||||||
|
|
||||||
example_indices = batch[3]
|
example_indices = batch[3]
|
||||||
|
|
||||||
|
# XLNet and XLM use more arguments for their predictions
|
||||||
if args.model_type in ['xlnet', 'xlm']:
|
if args.model_type in ['xlnet', 'xlm']:
|
||||||
inputs.update({'cls_index': batch[4],
|
inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
|
||||||
'p_mask': batch[5]})
|
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
|
|
||||||
for i, example_index in enumerate(example_indices):
|
for i, example_index in enumerate(example_indices):
|
||||||
eval_feature = features[example_index.item()]
|
eval_feature = features[example_index.item()]
|
||||||
unique_id = int(eval_feature.unique_id)
|
unique_id = int(eval_feature.unique_id)
|
||||||
if args.model_type in ['xlnet', 'xlm']:
|
|
||||||
# XLNet uses a more complex post-processing procedure
|
output = [to_list(output[i]) for output in outputs]
|
||||||
result = RawResultExtended(unique_id = unique_id,
|
|
||||||
start_top_log_probs = to_list(outputs[0][i]),
|
# Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
|
||||||
start_top_index = to_list(outputs[1][i]),
|
# models only use two.
|
||||||
end_top_log_probs = to_list(outputs[2][i]),
|
if len(output) >= 5:
|
||||||
end_top_index = to_list(outputs[3][i]),
|
start_logits = output[0]
|
||||||
cls_logits = to_list(outputs[4][i]))
|
start_top_index = output[1]
|
||||||
|
end_logits = output[2]
|
||||||
|
end_top_index = output[3]
|
||||||
|
cls_logits = output[4]
|
||||||
|
|
||||||
|
result = SquadResult(
|
||||||
|
unique_id, start_logits, end_logits,
|
||||||
|
start_top_index=start_top_index,
|
||||||
|
end_top_index=end_top_index,
|
||||||
|
cls_logits=cls_logits
|
||||||
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
result = RawResult(unique_id = unique_id,
|
start_logits, end_logits = output
|
||||||
start_logits = to_list(outputs[0][i]),
|
result = SquadResult(
|
||||||
end_logits = to_list(outputs[1][i]))
|
unique_id, start_logits, end_logits
|
||||||
|
)
|
||||||
|
|
||||||
all_results.append(result)
|
all_results.append(result)
|
||||||
|
|
||||||
evalTime = timeit.default_timer() - start_time
|
evalTime = timeit.default_timer() - start_time
|
||||||
@@ -261,84 +291,81 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
# Compute predictions
|
# Compute predictions
|
||||||
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
|
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
|
||||||
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
|
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
|
||||||
|
|
||||||
if args.version_2_with_negative:
|
if args.version_2_with_negative:
|
||||||
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
|
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
|
||||||
else:
|
else:
|
||||||
output_null_log_odds_file = None
|
output_null_log_odds_file = None
|
||||||
|
|
||||||
|
# XLNet and XLM use a more complex post-processing procedure
|
||||||
if args.model_type in ['xlnet', 'xlm']:
|
if args.model_type in ['xlnet', 'xlm']:
|
||||||
# XLNet uses a more complex post-processing procedure
|
predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
|
||||||
write_predictions_extended(examples, features, all_results, args.n_best_size,
|
|
||||||
args.max_answer_length, output_prediction_file,
|
args.max_answer_length, output_prediction_file,
|
||||||
output_nbest_file, output_null_log_odds_file, args.predict_file,
|
output_nbest_file, output_null_log_odds_file,
|
||||||
model.config.start_n_top, model.config.end_n_top,
|
model.config.start_n_top, model.config.end_n_top,
|
||||||
args.version_2_with_negative, tokenizer, args.verbose_logging)
|
args.version_2_with_negative, tokenizer, args.verbose_logging)
|
||||||
else:
|
else:
|
||||||
write_predictions(examples, features, all_results, args.n_best_size,
|
predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
|
||||||
args.max_answer_length, args.do_lower_case, output_prediction_file,
|
args.max_answer_length, args.do_lower_case, output_prediction_file,
|
||||||
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
|
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
|
||||||
args.version_2_with_negative, args.null_score_diff_threshold)
|
args.version_2_with_negative, args.null_score_diff_threshold)
|
||||||
|
|
||||||
# Evaluate with the official SQuAD script
|
# Compute the F1 and exact scores.
|
||||||
evaluate_options = EVAL_OPTS(data_file=args.predict_file,
|
results = squad_evaluate(examples, predictions)
|
||||||
pred_file=output_prediction_file,
|
|
||||||
na_prob_file=output_null_log_odds_file)
|
|
||||||
results = evaluate_on_squad(evaluate_options)
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||||
if args.local_rank not in [-1, 0] and not evaluate:
|
if args.local_rank not in [-1, 0] and not evaluate:
|
||||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||||
|
|
||||||
# Load data features from cache or dataset file
|
# Load data features from cache or dataset file
|
||||||
input_file = args.predict_file if evaluate else args.train_file
|
input_dir = args.data_dir if args.data_dir else "."
|
||||||
cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
|
cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
|
||||||
'dev' if evaluate else 'train',
|
'dev' if evaluate else 'train',
|
||||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||||
str(args.max_seq_length)))
|
str(args.max_seq_length))
|
||||||
|
)
|
||||||
|
|
||||||
|
# Init features and dataset from cache if it exists
|
||||||
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
|
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
features = torch.load(cached_features_file)
|
features_and_dataset = torch.load(cached_features_file)
|
||||||
|
features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
|
||||||
else:
|
else:
|
||||||
logger.info("Creating features from dataset file at %s", input_file)
|
logger.info("Creating features from dataset file at %s", input_dir)
|
||||||
examples = read_squad_examples(input_file=input_file,
|
|
||||||
is_training=not evaluate,
|
if not args.data_dir:
|
||||||
version_2_with_negative=args.version_2_with_negative)
|
try:
|
||||||
features = convert_examples_to_features(examples=examples,
|
import tensorflow_datasets as tfds
|
||||||
tokenizer=tokenizer,
|
except ImportError:
|
||||||
max_seq_length=args.max_seq_length,
|
raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
|
||||||
doc_stride=args.doc_stride,
|
|
||||||
max_query_length=args.max_query_length,
|
if args.version_2_with_negative:
|
||||||
is_training=not evaluate,
|
logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
|
||||||
cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
|
|
||||||
pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
|
tfds_examples = tfds.load("squad")
|
||||||
cls_token_at_end=True if args.model_type in ['xlnet'] else False,
|
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||||
sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
|
else:
|
||||||
|
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
|
||||||
|
examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||||
|
|
||||||
|
features, dataset = squad_convert_examples_to_features(
|
||||||
|
examples=examples,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
max_seq_length=args.max_seq_length,
|
||||||
|
doc_stride=args.doc_stride,
|
||||||
|
max_query_length=args.max_query_length,
|
||||||
|
is_training=not evaluate,
|
||||||
|
return_dataset='pt'
|
||||||
|
)
|
||||||
|
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
torch.save(features, cached_features_file)
|
torch.save({"features": features, "dataset": dataset}, cached_features_file)
|
||||||
|
|
||||||
if args.local_rank == 0 and not evaluate:
|
if args.local_rank == 0 and not evaluate:
|
||||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||||
|
|
||||||
# Convert to Tensors and build dataset
|
|
||||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
|
||||||
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
|
|
||||||
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
|
|
||||||
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
|
|
||||||
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
|
|
||||||
if evaluate:
|
|
||||||
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
|
|
||||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
|
||||||
all_example_index, all_cls_index, all_p_mask)
|
|
||||||
else:
|
|
||||||
all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
|
|
||||||
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
|
|
||||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
|
||||||
all_start_positions, all_end_positions,
|
|
||||||
all_cls_index, all_p_mask)
|
|
||||||
|
|
||||||
if output_examples:
|
if output_examples:
|
||||||
return dataset, examples, features
|
return dataset, examples, features
|
||||||
return dataset
|
return dataset
|
||||||
@@ -348,10 +375,6 @@ def main():
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
## Required parameters
|
## Required parameters
|
||||||
parser.add_argument("--train_file", default=None, type=str, required=True,
|
|
||||||
help="SQuAD json for training. E.g., train-v1.1.json")
|
|
||||||
parser.add_argument("--predict_file", default=None, type=str, required=True,
|
|
||||||
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
|
|
||||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||||
@@ -360,6 +383,8 @@ def main():
|
|||||||
help="The output directory where the model checkpoints and predictions will be written.")
|
help="The output directory where the model checkpoints and predictions will be written.")
|
||||||
|
|
||||||
## Other parameters
|
## Other parameters
|
||||||
|
parser.add_argument("--data_dir", default=None, type=str,
|
||||||
|
help="The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets.")
|
||||||
parser.add_argument("--config_name", default="", type=str,
|
parser.add_argument("--config_name", default="", type=str,
|
||||||
help="Pretrained config name or path if not the same as model_name")
|
help="Pretrained config name or path if not the same as model_name")
|
||||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||||
@@ -398,7 +423,7 @@ def main():
|
|||||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||||
help="Weight deay if we apply some.")
|
help="Weight decay if we apply some.")
|
||||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||||
help="Epsilon for Adam optimizer.")
|
help="Epsilon for Adam optimizer.")
|
||||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||||
@@ -444,6 +469,11 @@ def main():
|
|||||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format(
|
||||||
|
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||||
|
str(args.max_seq_length))
|
||||||
|
)
|
||||||
|
|
||||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||||
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||||
|
|
||||||
@@ -533,7 +563,7 @@ def main():
|
|||||||
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||||
|
|
||||||
# Load a trained model and vocabulary that you have fine-tuned
|
# Load a trained model and vocabulary that you have fine-tuned
|
||||||
model = model_class.from_pretrained(args.output_dir)
|
model = model_class.from_pretrained(args.output_dir, force_download=True)
|
||||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||||
model.to(args.device)
|
model.to(args.device)
|
||||||
|
|
||||||
@@ -551,7 +581,7 @@ def main():
|
|||||||
for checkpoint in checkpoints:
|
for checkpoint in checkpoints:
|
||||||
# Reload the model
|
# Reload the model
|
||||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
model = model_class.from_pretrained(checkpoint)
|
model = model_class.from_pretrained(checkpoint, force_download=True)
|
||||||
model.to(args.device)
|
model.to(args.device)
|
||||||
|
|
||||||
# Evaluate
|
# Evaluate
|
||||||
|
|||||||
@@ -1,488 +0,0 @@
|
|||||||
# coding=utf-8
|
|
||||||
# Copyright 2019 The HuggingFace Inc. team.
|
|
||||||
# Copyright (c) 2019 The HuggingFace Inc. All rights reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
""" Finetuning seq2seq models for sequence generation."""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import functools
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from tqdm import tqdm, trange
|
|
||||||
import torch
|
|
||||||
from torch.optim import Adam
|
|
||||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
|
||||||
|
|
||||||
from transformers import (
|
|
||||||
AutoTokenizer,
|
|
||||||
BertForMaskedLM,
|
|
||||||
BertConfig,
|
|
||||||
PreTrainedEncoderDecoder,
|
|
||||||
Model2Model,
|
|
||||||
)
|
|
||||||
|
|
||||||
from utils_summarization import (
|
|
||||||
CNNDailyMailDataset,
|
|
||||||
encode_for_summarization,
|
|
||||||
fit_to_block_size,
|
|
||||||
build_lm_labels,
|
|
||||||
build_mask,
|
|
||||||
compute_token_type_ids,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
||||||
|
|
||||||
|
|
||||||
def set_seed(args):
|
|
||||||
random.seed(args.seed)
|
|
||||||
np.random.seed(args.seed)
|
|
||||||
torch.manual_seed(args.seed)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------
|
|
||||||
# Load dataset
|
|
||||||
# ------------
|
|
||||||
|
|
||||||
|
|
||||||
def load_and_cache_examples(args, tokenizer):
|
|
||||||
dataset = CNNDailyMailDataset(tokenizer, data_dir=args.data_dir)
|
|
||||||
return dataset
|
|
||||||
|
|
||||||
|
|
||||||
def collate(data, tokenizer, block_size):
|
|
||||||
""" List of tuple as an input. """
|
|
||||||
# remove the files with empty an story/summary, encode and fit to block
|
|
||||||
data = filter(lambda x: not (len(x[0]) == 0 or len(x[1]) == 0), data)
|
|
||||||
data = [
|
|
||||||
encode_for_summarization(story, summary, tokenizer) for story, summary in data
|
|
||||||
]
|
|
||||||
data = [
|
|
||||||
(
|
|
||||||
fit_to_block_size(story, block_size, tokenizer.pad_token_id),
|
|
||||||
fit_to_block_size(summary, block_size, tokenizer.pad_token_id),
|
|
||||||
)
|
|
||||||
for story, summary in data
|
|
||||||
]
|
|
||||||
|
|
||||||
stories = torch.tensor([story for story, summary in data])
|
|
||||||
summaries = torch.tensor([summary for story, summary in data])
|
|
||||||
encoder_token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
|
|
||||||
encoder_mask = build_mask(stories, tokenizer.pad_token_id)
|
|
||||||
decoder_mask = build_mask(summaries, tokenizer.pad_token_id)
|
|
||||||
lm_labels = build_lm_labels(summaries, tokenizer.pad_token_id)
|
|
||||||
|
|
||||||
return (
|
|
||||||
stories,
|
|
||||||
summaries,
|
|
||||||
encoder_token_type_ids,
|
|
||||||
encoder_mask,
|
|
||||||
decoder_mask,
|
|
||||||
lm_labels,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ----------
|
|
||||||
# Optimizers
|
|
||||||
# ----------
|
|
||||||
|
|
||||||
|
|
||||||
class BertSumOptimizer(object):
|
|
||||||
""" Specific optimizer for BertSum.
|
|
||||||
|
|
||||||
As described in [1], the authors fine-tune BertSum for abstractive
|
|
||||||
summarization using two Adam Optimizers with different warm-up steps and
|
|
||||||
learning rate. They also use a custom learning rate scheduler.
|
|
||||||
|
|
||||||
[1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
|
|
||||||
arXiv preprint arXiv:1908.08345 (2019).
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
|
|
||||||
self.encoder = model.encoder
|
|
||||||
self.decoder = model.decoder
|
|
||||||
self.lr = lr
|
|
||||||
self.warmup_steps = warmup_steps
|
|
||||||
|
|
||||||
self.optimizers = {
|
|
||||||
"encoder": Adam(
|
|
||||||
model.encoder.parameters(),
|
|
||||||
lr=lr["encoder"],
|
|
||||||
betas=(beta_1, beta_2),
|
|
||||||
eps=eps,
|
|
||||||
),
|
|
||||||
"decoder": Adam(
|
|
||||||
model.decoder.parameters(),
|
|
||||||
lr=lr["decoder"],
|
|
||||||
betas=(beta_1, beta_2),
|
|
||||||
eps=eps,
|
|
||||||
),
|
|
||||||
}
|
|
||||||
|
|
||||||
self._step = 0
|
|
||||||
|
|
||||||
def _update_rate(self, stack):
|
|
||||||
return self.lr[stack] * min(
|
|
||||||
self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-0.5)
|
|
||||||
)
|
|
||||||
|
|
||||||
def zero_grad(self):
|
|
||||||
self.optimizer_decoder.zero_grad()
|
|
||||||
self.optimizer_encoder.zero_grad()
|
|
||||||
|
|
||||||
def step(self):
|
|
||||||
self._step += 1
|
|
||||||
for stack, optimizer in self.optimizers.items():
|
|
||||||
new_rate = self._update_rate(stack)
|
|
||||||
for param_group in optimizer.param_groups:
|
|
||||||
param_group["lr"] = new_rate
|
|
||||||
optimizer.step()
|
|
||||||
|
|
||||||
|
|
||||||
# ------------
|
|
||||||
# Train
|
|
||||||
# ------------
|
|
||||||
|
|
||||||
|
|
||||||
def train(args, model, tokenizer):
|
|
||||||
""" Fine-tune the pretrained model on the corpus. """
|
|
||||||
set_seed(args)
|
|
||||||
|
|
||||||
# Load the data
|
|
||||||
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
|
|
||||||
train_dataset = load_and_cache_examples(args, tokenizer)
|
|
||||||
train_sampler = RandomSampler(train_dataset)
|
|
||||||
model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512)
|
|
||||||
train_dataloader = DataLoader(
|
|
||||||
train_dataset,
|
|
||||||
sampler=train_sampler,
|
|
||||||
batch_size=args.train_batch_size,
|
|
||||||
collate_fn=model_collate_fn,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Training schedule
|
|
||||||
if args.max_steps > 0:
|
|
||||||
t_total = args.max_steps
|
|
||||||
args.num_train_epochs = t_total // (
|
|
||||||
len(train_dataloader) // args.gradient_accumulation_steps + 1
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
t_total = (
|
|
||||||
len(train_dataloader)
|
|
||||||
// args.gradient_accumulation_steps
|
|
||||||
* args.num_train_epochs
|
|
||||||
)
|
|
||||||
|
|
||||||
# Prepare the optimizer
|
|
||||||
lr = {"encoder": 0.002, "decoder": 0.2}
|
|
||||||
warmup_steps = {"encoder": 20000, "decoder": 10000}
|
|
||||||
optimizer = BertSumOptimizer(model, lr, warmup_steps)
|
|
||||||
|
|
||||||
# Train
|
|
||||||
logger.info("***** Running training *****")
|
|
||||||
logger.info(" Num examples = %d", len(train_dataset))
|
|
||||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
|
||||||
logger.info(
|
|
||||||
" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
|
|
||||||
)
|
|
||||||
logger.info(
|
|
||||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
|
||||||
args.train_batch_size * args.gradient_accumulation_steps
|
|
||||||
# * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
|
||||||
)
|
|
||||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
|
||||||
logger.info(" Total optimization steps = %d", t_total)
|
|
||||||
|
|
||||||
model.zero_grad()
|
|
||||||
train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)
|
|
||||||
|
|
||||||
global_step = 0
|
|
||||||
tr_loss = 0.0
|
|
||||||
for _ in train_iterator:
|
|
||||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
|
|
||||||
for step, batch in enumerate(epoch_iterator):
|
|
||||||
source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
|
|
||||||
|
|
||||||
source = source.to(args.device)
|
|
||||||
target = target.to(args.device)
|
|
||||||
encoder_token_type_ids = encoder_token_type_ids.to(args.device)
|
|
||||||
encoder_mask = encoder_mask.to(args.device)
|
|
||||||
decoder_mask = decoder_mask.to(args.device)
|
|
||||||
lm_labels = lm_labels.to(args.device)
|
|
||||||
|
|
||||||
model.train()
|
|
||||||
outputs = model(
|
|
||||||
source,
|
|
||||||
target,
|
|
||||||
encoder_token_type_ids=encoder_token_type_ids,
|
|
||||||
encoder_attention_mask=encoder_mask,
|
|
||||||
decoder_attention_mask=decoder_mask,
|
|
||||||
decoder_lm_labels=lm_labels,
|
|
||||||
)
|
|
||||||
|
|
||||||
loss = outputs[0]
|
|
||||||
print(loss)
|
|
||||||
if args.gradient_accumulation_steps > 1:
|
|
||||||
loss /= args.gradient_accumulation_steps
|
|
||||||
|
|
||||||
loss.backward()
|
|
||||||
|
|
||||||
tr_loss += loss.item()
|
|
||||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
|
||||||
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
|
|
||||||
optimizer.step()
|
|
||||||
model.zero_grad()
|
|
||||||
global_step += 1
|
|
||||||
|
|
||||||
if args.max_steps > 0 and global_step > args.max_steps:
|
|
||||||
epoch_iterator.close()
|
|
||||||
break
|
|
||||||
|
|
||||||
if args.max_steps > 0 and global_step > args.max_steps:
|
|
||||||
train_iterator.close()
|
|
||||||
break
|
|
||||||
|
|
||||||
return global_step, tr_loss / global_step
|
|
||||||
|
|
||||||
|
|
||||||
# ------------
|
|
||||||
# Train
|
|
||||||
# ------------
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(args, model, tokenizer, prefix=""):
|
|
||||||
set_seed(args)
|
|
||||||
|
|
||||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
|
||||||
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
|
|
||||||
eval_sampler = SequentialSampler(eval_dataset)
|
|
||||||
eval_dataloader = DataLoader(
|
|
||||||
eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
|
||||||
logger.info(" Num examples = %d", len(eval_dataset))
|
|
||||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
|
||||||
eval_loss = 0.0
|
|
||||||
nb_eval_steps = 0
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
|
||||||
source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
|
|
||||||
|
|
||||||
source = source.to(args.device)
|
|
||||||
target = target.to(args.device)
|
|
||||||
encoder_token_type_ids = encoder_token_type_ids.to(args.device)
|
|
||||||
encoder_mask = encoder_mask.to(args.device)
|
|
||||||
decoder_mask = decoder_mask.to(args.device)
|
|
||||||
lm_labels = lm_labels.to(args.device)
|
|
||||||
|
|
||||||
with torch.no_grad():
|
|
||||||
outputs = model(
|
|
||||||
source,
|
|
||||||
target,
|
|
||||||
encoder_token_type_ids=encoder_token_type_ids,
|
|
||||||
encoder_attention_mask=encoder_mask,
|
|
||||||
decoder_attention_mask=decoder_mask,
|
|
||||||
decoder_lm_labels=lm_labels,
|
|
||||||
)
|
|
||||||
lm_loss = outputs[0]
|
|
||||||
eval_loss += lm_loss.mean().item()
|
|
||||||
nb_eval_steps += 1
|
|
||||||
|
|
||||||
eval_loss = eval_loss / nb_eval_steps
|
|
||||||
perplexity = torch.exp(torch.tensor(eval_loss))
|
|
||||||
|
|
||||||
result = {"perplexity": perplexity}
|
|
||||||
|
|
||||||
# Save the evaluation's results
|
|
||||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
|
||||||
if not os.path.exists(args.output_dir):
|
|
||||||
os.makedirs(args.output_dir)
|
|
||||||
|
|
||||||
with open(output_eval_file, "w") as writer:
|
|
||||||
logger.info("***** Eval results {} *****".format(prefix))
|
|
||||||
for key in sorted(result.keys()):
|
|
||||||
logger.info(" %s = %s", key, str(result[key]))
|
|
||||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def _str_to_bool(value):
    """Convert a command-line string such as ``true``/``False``/``1`` to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including ``"False"``) is parsed as ``True``. This
    converter interprets the text instead.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, as it was with ``type=bool``.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got {!r}.".format(value))


def main():
    """Command-line entry point: parse arguments, build the model, train and/or evaluate.

    Steps, in order:
      1. Parse the command line.
      2. Refuse to train into an existing, non-empty output directory unless
         ``--do_overwrite_output_dir`` is set.
      3. Select the device (CPU when ``--to_cpu`` is set or CUDA is unavailable).
      4. Load the tokenizer and build the encoder-decoder model.
      5. With ``--do_train``: train, then save model, tokenizer and arguments to
         the output directory.
      6. With ``--do_evaluate``: iterate over the checkpoints to evaluate. The
         checkpoint list is currently empty, so nothing is evaluated.

    Returns:
        The evaluation results; an empty dict when no checkpoint was evaluated.

    Raises:
        ValueError: if the output directory exists, is not empty, training was
            requested and overwriting was not allowed.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input training data file (a text file).",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Optional parameters
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    # Boolean options accept an explicit value ("--do_train true") or no value
    # at all ("--do_train"), in which case ``const=True`` applies.
    parser.add_argument(
        "--do_evaluate",
        type=_str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Run model evaluation on out-of-sample data.",
    )
    parser.add_argument(
        "--do_train",
        type=_str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Run training.",
    )
    parser.add_argument(
        "--do_overwrite_output_dir",
        type=_str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Whether to overwrite the output dir.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint to initialize the encoder and decoder's weights with.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        help="The decoder architecture to be fine-tuned.",
    )
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--to_cpu",
        default=False,
        type=_str_to_bool,
        nargs="?",
        const=True,
        help="Whether to force training on CPU.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument("--seed", default=42, type=int)
    args = parser.parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.do_overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
                args.output_dir
            )
        )

    # Set up training device
    if args.to_cpu or not torch.cuda.is_available():
        args.device = torch.device("cpu")
        args.n_gpu = 0
    else:
        args.device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()

    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    config = BertConfig.from_pretrained(args.model_name_or_path)
    decoder_model = BertForMaskedLM(config)
    model = Model2Model.from_pretrained(
        args.model_name_or_path, decoder_model=decoder_model
    )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    # Rank, distributed and 16-bit values are constants here: this function
    # sets up neither distributed nor mixed-precision training.
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        0,
        args.device,
        args.n_gpu,
        False,
        False,
    )

    logger.info("Training/evaluation parameters %s", args)

    # Train the model
    model.to(args.device)
    if args.do_train:
        global_step, tr_loss = train(args, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)

        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))

    # Evaluate the model
    results = {}
    if args.do_evaluate:
        # No checkpoint discovery is implemented yet: the list is empty, so the
        # loop below never runs and ``results`` stays an empty dict.
        checkpoints = []
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            encoder_checkpoint = os.path.join(checkpoint, "encoder")
            decoder_checkpoint = os.path.join(checkpoint, "decoder")
            model = PreTrainedEncoderDecoder.from_pretrained(
                encoder_checkpoint, decoder_checkpoint
            )
            model.to(args.device)
            results = "placeholder"

    return results


if __name__ == "__main__":
    main()
|
|
||||||
@@ -73,6 +73,8 @@ model.save_pretrained('./save/')
|
|||||||
|
|
||||||
if TASK == "mrpc":
|
if TASK == "mrpc":
|
||||||
# Load the TensorFlow model in PyTorch for inspection
|
# Load the TensorFlow model in PyTorch for inspection
|
||||||
|
# This is to demo the interoperability between the two frameworks, you don't have to
|
||||||
|
# do this in real life (you can run the inference on the TF model).
|
||||||
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
|
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
|
||||||
|
|
||||||
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
|
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
|
||||||
|
|||||||
615
examples/run_tf_ner.py
Normal file
615
examples/run_tf_ner.py
Normal file
@@ -0,0 +1,615 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
import datetime
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
import glob
|
||||||
|
import re
|
||||||
|
import tensorflow as tf
|
||||||
|
import collections
|
||||||
|
import numpy as np
|
||||||
|
from seqeval import metrics
|
||||||
|
import _pickle as pickle
|
||||||
|
from absl import logging
|
||||||
|
from transformers import TF2_WEIGHTS_NAME, BertConfig, BertTokenizer, TFBertForTokenClassification
|
||||||
|
from transformers import RobertaConfig, RobertaTokenizer, TFRobertaForTokenClassification
|
||||||
|
from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForTokenClassification
|
||||||
|
from transformers import create_optimizer, GradientAccumulator
|
||||||
|
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
||||||
|
from fastprogress import master_bar, progress_bar
|
||||||
|
from absl import flags
|
||||||
|
from absl import app
|
||||||
|
|
||||||
|
|
||||||
|
# Flat tuple of every pretrained shortcut name listed in the archive maps of
# the supported configuration classes, in class order.
ALL_MODELS = tuple(
    shortcut_name
    for config_cls in (BertConfig, RobertaConfig, DistilBertConfig)
    for shortcut_name in config_cls.pretrained_config_archive_map.keys()
)

# Value of the ``model_type`` flag -> (config class, model class, tokenizer class).
MODEL_CLASSES = dict(
    bert=(BertConfig, TFBertForTokenClassification, BertTokenizer),
    roberta=(RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
    distilbert=(DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer),
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Command-line flags. Each definition registers one flag with absl; main()
# later reads them all at once through ``flags.FLAGS.flag_values_dict()``.
# ---------------------------------------------------------------------------

# Data, model and output locations
flags.DEFINE_string(
    name="data_dir",
    default=None,
    help="The input data dir. Should contain the .conll files (or other data files) for the task.",
)

flags.DEFINE_string(
    name="model_type",
    default=None,
    help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
)

flags.DEFINE_string(
    name="model_name_or_path",
    default=None,
    help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)

flags.DEFINE_string(
    name="output_dir",
    default=None,
    help="The output directory where the model checkpoints will be written.",
)

flags.DEFINE_string(
    name="labels",
    default="",
    help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
)

flags.DEFINE_string(
    name="config_name",
    default="",
    help="Pretrained config name or path if not the same as model_name",
)

flags.DEFINE_string(
    name="tokenizer_name",
    default="",
    help="Pretrained tokenizer name or path if not the same as model_name",
)

flags.DEFINE_string(
    name="cache_dir",
    default="",
    help="Where do you want to store the pre-trained models downloaded from s3",
)

flags.DEFINE_integer(
    name="max_seq_length",
    default=128,
    help="The maximum total input sentence length after tokenization. "
         "Sequences longer than this will be truncated, sequences shorter will be padded.",
)

# Hardware selection
flags.DEFINE_string(
    name="tpu",
    default=None,
    help="The Cloud TPU to use for training. This should be either the name "
         "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.",
)

flags.DEFINE_integer(
    name="num_tpu_cores",
    default=8,
    help="Total number of TPU cores to use.",
)

# What to run
flags.DEFINE_boolean(
    name="do_train",
    default=False,
    help="Whether to run training.",
)

flags.DEFINE_boolean(
    name="do_eval",
    default=False,
    help="Whether to run eval on the dev set.",
)

flags.DEFINE_boolean(
    name="do_predict",
    default=False,
    help="Whether to run predictions on the test set.",
)

flags.DEFINE_boolean(
    name="evaluate_during_training",
    default=False,
    help="Whether to run evaluation during training at each logging step.",
)

flags.DEFINE_boolean(
    name="do_lower_case",
    default=False,
    help="Set this flag if you are using an uncased model.",
)

# Batch sizes and optimisation hyper-parameters
flags.DEFINE_integer(
    name="per_device_train_batch_size",
    default=8,
    help="Batch size per GPU/CPU/TPU for training.",
)

flags.DEFINE_integer(
    name="per_device_eval_batch_size",
    default=8,
    help="Batch size per GPU/CPU/TPU for evaluation.",
)

flags.DEFINE_integer(
    name="gradient_accumulation_steps",
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)

flags.DEFINE_float(
    name="learning_rate",
    default=5e-5,
    help="The initial learning rate for Adam.",
)

flags.DEFINE_float(
    name="weight_decay",
    default=0.0,
    help="Weight decay if we apply some.",
)

flags.DEFINE_float(
    name="adam_epsilon",
    default=1e-8,
    help="Epsilon for Adam optimizer.",
)

flags.DEFINE_float(
    name="max_grad_norm",
    default=1.0,
    help="Max gradient norm.",
)

flags.DEFINE_integer(
    name="num_train_epochs",
    default=3,
    help="Total number of training epochs to perform.",
)

flags.DEFINE_integer(
    name="max_steps",
    default=-1,
    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)

flags.DEFINE_integer(
    name="warmup_steps",
    default=0,
    help="Linear warmup over warmup_steps.",
)

# Logging and checkpointing
flags.DEFINE_integer(
    name="logging_steps",
    default=50,
    help="Log every X updates steps.",
)

flags.DEFINE_integer(
    name="save_steps",
    default=50,
    help="Save checkpoint every X updates steps.",
)

flags.DEFINE_boolean(
    name="eval_all_checkpoints",
    default=False,
    help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)

# Miscellaneous
flags.DEFINE_boolean(
    name="no_cuda",
    default=False,
    help="Avoid using CUDA when available",
)

flags.DEFINE_boolean(
    name="overwrite_output_dir",
    default=False,
    help="Overwrite the content of the output directory",
)

flags.DEFINE_boolean(
    name="overwrite_cache",
    default=False,
    help="Overwrite the cached training and evaluation sets",
)

flags.DEFINE_integer(
    name="seed",
    default=42,
    help="random seed for initialization",
)

flags.DEFINE_boolean(
    name="fp16",
    default=False,
    help="Whether to use 16-bit (mixed) precision instead of 32-bit",
)

flags.DEFINE_string(
    name="gpus",
    default="0",
    help="Comma separated list of gpus devices. If only one, switch to single "
         "gpu strategy, if None takes all the gpus available.",
)
|
||||||
|
|
||||||
|
|
||||||
|
def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id):
    """Run the training loop with gradient accumulation under a distribution strategy.

    Args:
        args: dict of flag values; also read for the ``'n_device'`` entry.
            ``args['num_train_epochs']`` is overwritten with 1 when
            ``args['max_steps'] > 0``.
        strategy: ``tf.distribute`` strategy used to run the step functions.
        train_dataset: iterable yielding ``(features, labels)`` batches, where
            ``features`` provides ``'input_ids'``, ``'input_mask'`` and
            ``'segment_ids'``.
        tokenizer: only forwarded to ``evaluate``.
        model: model to train; called as ``model(input_ids, **inputs)`` and
            saved with ``save_pretrained``.
        num_train_examples: number of training examples, used to derive the
            step count.
        labels: list of label names; the logits are reshaped to
            ``len(labels) + 1`` classes.
        train_batch_size: global batch size used to scale the summed loss.
        pad_token_label_id: only forwarded to ``evaluate``.

    Returns:
        None. Checkpoints are written under ``args['output_dir']`` and
        summaries to ``/tmp/mylogs``.
    """
    if args['max_steps'] > 0:
        num_train_steps = args['max_steps'] * args['gradient_accumulation_steps']
        args['num_train_epochs'] = 1
    else:
        num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    writer = tf.summary.create_file_writer("/tmp/mylogs")

    with strategy.scope():
        # Reduction.NONE keeps per-token losses; they are summed and scaled manually below.
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
        optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps'])

        if args['fp16']:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')

        loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()

    logging.info("***** Running training *****")
    logging.info(" Num examples = %d", num_train_examples)
    logging.info(" Num Epochs = %d", args['num_train_epochs'])
    logging.info(" Instantaneous batch size per device = %d", args['per_device_train_batch_size'])
    logging.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                 train_batch_size * args['gradient_accumulation_steps'])
    logging.info(" Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logging.info(" Total training steps = %d", num_train_steps)

    model.summary()

    @tf.function
    def apply_gradients():
        """Apply the accumulated gradients, averaged over devices and accumulation steps."""
        grads_and_vars = []

        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
            if gradient is not None:
                scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps'])
                grads_and_vars.append((scaled_gradient, variable))
            else:
                grads_and_vars.append((gradient, variable))

        optimizer.apply_gradients(grads_and_vars, args['max_grad_norm'])
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        """Accumulate gradients for one batch and return the mean per-token loss."""
        def step_fn(train_features, train_labels):
            inputs = {'attention_mask': train_features['input_mask'], 'training': True}

            if args['model_type'] != "distilbert":
                inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None

            with tf.GradientTape() as tape:
                logits = model(train_features['input_ids'], **inputs)[0]
                logits = tf.reshape(logits, (-1, len(labels) + 1))
                # Only positions selected by the input mask contribute to the loss.
                active_loss = tf.reshape(train_features['input_mask'], (-1,))
                active_logits = tf.boolean_mask(logits, active_loss)
                train_labels = tf.reshape(train_labels, (-1,))
                active_labels = tf.boolean_mask(train_labels, active_loss)
                cross_entropy = loss_fct(active_labels, active_logits)
                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
                grads = tape.gradient(loss, model.trainable_variables)

                gradient_accumulator(grads)

            return cross_entropy

        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)

        return mean_loss

    current_time = datetime.datetime.now()
    train_iterator = master_bar(range(args['num_train_epochs']))
    global_step = 0
    logging_loss = 0.0

    for epoch in train_iterator:
        epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1)
        # ``step`` counts batches within the current epoch; ``global_step``
        # counts optimizer updates across the whole run.
        step = 1

        with strategy.scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)

                if step % args['gradient_accumulation_steps'] == 0:
                    strategy.experimental_run_v2(apply_gradients)

                    loss_metric(loss)

                    global_step += 1

                    if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                        # Log metrics
                        if args['n_device'] == 1 and args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
                            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
                            report = metrics.classification_report(y_true, y_pred, digits=4)

                            logging.info("Eval at step " + str(global_step) + "\n" + report)
                            logging.info("eval_loss: " + str(eval_loss))

                            precision = metrics.precision_score(y_true, y_pred)
                            recall = metrics.recall_score(y_true, y_pred)
                            f1 = metrics.f1_score(y_true, y_pred)

                            with writer.as_default():
                                tf.summary.scalar("eval_loss", eval_loss, global_step)
                                tf.summary.scalar("precision", precision, global_step)
                                tf.summary.scalar("recall", recall, global_step)
                                tf.summary.scalar("f1", f1, global_step)

                        lr = optimizer.learning_rate
                        # Evaluate the schedule at the optimizer-update count.
                        # The per-epoch batch counter ``step`` resets every
                        # epoch and runs ahead under gradient accumulation, so
                        # it would report a learning rate the optimizer is not
                        # using.
                        learning_rate = lr(global_step)

                        with writer.as_default():
                            tf.summary.scalar("lr", learning_rate, global_step)
                            tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'], global_step)

                        logging_loss = loss_metric.result()

                    # NOTE(review): this writes the same "loss" tag as above but
                    # indexed by the per-epoch ``step`` -- confirm that is intended.
                    with writer.as_default():
                        tf.summary.scalar("loss", loss_metric.result(), step=step)

                    if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                        # Save model checkpoint
                        output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)

                        model.save_pretrained(output_dir)
                        logging.info("Saving model checkpoint to %s", output_dir)

                train_iterator.child.comment = f'loss : {loss_metric.result()}'
                step += 1

        train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}')

        loss_metric.reset_states()

    logging.info(" Training took time = {}".format(datetime.datetime.now() - current_time))
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
    """Run the model over the ``mode`` split and collect label sequences and loss.

    Args:
        args: dict of flag values; also read for the ``'n_device'`` entry.
        strategy: ``tf.distribute`` strategy used to distribute the dataset.
        model: model called as ``model(input_ids, **inputs)``; element 0 of its
            output is used as the logits.
        tokenizer: forwarded to ``load_and_cache_examples``.
        labels: list of label names. Class id ``k`` is mapped to
            ``labels[k - 1]``.
        pad_token_label_id: label id marking positions to ignore when building
            the returned sequences.
        mode: name of the data split to load (e.g. ``"dev"``).

    Returns:
        Tuple ``(y_true, y_pred, loss)``: two lists with one list of label names
        per example, and the loss averaged over evaluation steps.

    Raises:
        ValueError: if the evaluation dataset yields no batch.
    """
    eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
    eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode)
    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
    num_eval_steps = math.ceil(size / eval_batch_size)
    master = master_bar(range(1))
    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1)
    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0.0

    # Batches are collected and concatenated once after the loop; appending to
    # a growing array would copy everything gathered so far on every batch.
    logits_batches = []
    label_batches = []

    logging.info("***** Running evaluation *****")
    logging.info(" Num examples = %d", size)
    logging.info(" Batch size = %d", eval_batch_size)

    for eval_features, eval_labels in eval_iterator:
        inputs = {'attention_mask': eval_features['input_mask'], 'training': False}

        if args['model_type'] != "distilbert":
            inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None

        with strategy.scope():
            logits = model(eval_features['input_ids'], **inputs)[0]
            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
            # Only positions selected by the input mask contribute to the loss.
            active_loss = tf.reshape(eval_features['input_mask'], (-1,))
            active_logits = tf.boolean_mask(tmp_logits, active_loss)
            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
            cross_entropy = loss_fct(active_labels, active_logits)
            loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)

        logits_batches.append(logits.numpy())
        label_batches.append(eval_labels.numpy())

    if not logits_batches:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError("No evaluation batch was produced for mode '{}'.".format(mode))

    preds = np.argmax(np.concatenate(logits_batches, axis=0), axis=2)
    label_ids = np.concatenate(label_batches, axis=0)
    y_pred = [[] for _ in range(label_ids.shape[0])]
    y_true = [[] for _ in range(label_ids.shape[0])]
    loss = loss / num_eval_steps

    for i in range(label_ids.shape[0]):
        for j in range(label_ids.shape[1]):
            if label_ids[i, j] != pad_token_label_id:
                # Class ids are shifted by one relative to ``labels``.
                y_pred[i].append(labels[preds[i, j] - 1])
                y_true[i].append(labels[label_ids[i, j] - 1])

    return y_true, y_pred, loss.numpy()
|
||||||
|
|
||||||
|
|
||||||
|
def load_cache(cached_file, max_seq_length):
    """Open a TFRecord feature cache and count the records it holds.

    Args:
        cached_file: path of the TFRecord file to read.
        max_seq_length: length of every fixed-size int64 feature in a record.

    Returns:
        Tuple ``(dataset, count)``: a dataset of ``(features, label_ids)`` pairs,
        where ``features`` holds ``input_ids``, ``input_mask`` and
        ``segment_ids``, and the number of records obtained by reducing over
        the whole dataset.
    """
    input_keys = ("input_ids", "input_mask", "segment_ids")

    # Every stored field is a fixed-length int64 vector of ``max_seq_length``.
    name_to_features = {
        key: tf.io.FixedLenFeature([max_seq_length], tf.int64)
        for key in input_keys + ("label_ids",)
    }

    def _decode_record(record):
        # Split one parsed example into (model inputs, labels).
        parsed = tf.io.parse_single_example(record, name_to_features)
        model_inputs = {key: parsed[key] for key in input_keys}

        return model_inputs, parsed['label_ids']

    dataset = tf.data.TFRecordDataset(cached_file).map(_decode_record, num_parallel_calls=4)
    # Counting requires a full pass over the file.
    num_records = dataset.reduce(0, lambda running_total, _: running_total + 1)

    return dataset, num_records.numpy()
|
||||||
|
|
||||||
|
|
||||||
|
def save_cache(features, cached_features_file):
    """Serialize the features to a TFRecord file.

    Each feature is written as one ``tf.train.Example`` with four int64-list
    fields: ``input_ids``, ``input_mask``, ``segment_ids`` and ``label_ids``.

    Args:
        features: sequence of objects exposing ``input_ids``, ``input_mask``,
            ``segment_ids`` and ``label_ids`` attributes.
        cached_features_file: path of the TFRecord file to create.
    """
    def create_int_feature(values):
        # Wrap an iterable of ints as an int64-list feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    total = len(features)

    # The context manager closes the writer even when serialization fails, so
    # the file handle is never leaked.
    with tf.io.TFRecordWriter(cached_features_file) as writer:
        for ex_index, feature in enumerate(features):
            if ex_index % 5000 == 0:
                logging.info("Writing example %d of %d" % (ex_index, total))

            record_feature = collections.OrderedDict()
            record_feature["input_ids"] = create_int_feature(feature.input_ids)
            record_feature["input_mask"] = create_int_feature(feature.input_mask)
            record_feature["segment_ids"] = create_int_feature(feature.segment_ids)
            record_feature["label_ids"] = create_int_feature(feature.label_ids)

            tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature))

            writer.write(tf_example.SerializeToString())
|
||||||
|
|
||||||
|
|
||||||
|
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
    """Build the batched dataset for ``mode``, creating the TFRecord cache when needed.

    The cache file lives in ``args['data_dir']`` and is named after the mode,
    the last non-empty path component of ``args['model_name_or_path']`` and the
    maximum sequence length. It is rebuilt when missing or when
    ``args['overwrite_cache']`` is set.

    Args:
        args: dict of flag values.
        tokenizer: tokenizer used to convert examples to features.
        labels: list of label names.
        pad_token_label_id: label id assigned to padded positions.
        batch_size: number of examples per batch.
        mode: name of the data split (``'train'`` enables repeat and shuffle).

    Returns:
        Tuple ``(dataset, size)``: the batched, prefetched dataset and the
        number of examples in the cache file.
    """
    # Incomplete final batches are dropped on TPU and for training.
    drop_remainder = bool(args['tpu'] or mode == 'train')

    # Load data features from cache or dataset file
    model_name_parts = [part for part in args['model_name_or_path'].split("/") if part]
    cache_name = "cached_{}_{}_{}.tf_record".format(mode, model_name_parts[-1], str(args['max_seq_length']))
    cached_features_file = os.path.join(args['data_dir'], cache_name)

    if os.path.exists(cached_features_file) and not args['overwrite_cache']:
        logging.info("Loading features from cached file %s", cached_features_file)
    else:
        logging.info("Creating features from dataset file at %s", args['data_dir'])
        examples = read_examples_from_file(args['data_dir'], mode)
        conversion_options = dict(
            # xlnet has a cls token at the end
            cls_token_at_end=bool(args['model_type'] in ["xlnet"]),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args['model_type'] in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args['model_type'] in ["roberta"]),
            # pad on the left for xlnet
            pad_on_left=bool(args['model_type'] in ["xlnet"]),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args['model_type'] in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
        features = convert_examples_to_features(examples, labels, args['max_seq_length'], tokenizer, **conversion_options)
        logging.info("Saving features into cached file %s", cached_features_file)
        save_cache(features, cached_features_file)

    # In both cases the dataset is read back from the cache file.
    dataset, size = load_cache(cached_features_file, args['max_seq_length'])

    if mode == 'train':
        dataset = dataset.repeat().shuffle(buffer_size=8192, seed=args['seed'])

    dataset = dataset.batch(batch_size, drop_remainder).prefetch(buffer_size=batch_size)

    return dataset, size
|
||||||
|
|
||||||
|
|
||||||
|
def main(_):
    """Train, evaluate and/or predict with a TF 2 token-classification model.

    All settings are read from ``flags.FLAGS``. A ``tf.distribute`` strategy
    is selected (TPU, several GPUs, CPU or a single GPU) and the stages
    enabled by the ``do_train``, ``do_eval`` and ``do_predict`` flags are run
    in that order.

    Args:
        _: Unused positional argument; this function is passed to ``app.run``.

    Raises:
        ValueError: If ``do_train`` is set and the output directory exists and
            is not empty while ``overwrite_output_dir`` is not set.
    """
    logging.set_verbosity(logging.INFO)
    args = flags.FLAGS.flag_values_dict()

    if os.path.exists(args['output_dir']) and os.listdir(
            args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args['output_dir']))

    if args['fp16']:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    # Pick the distribution strategy and record the number of devices in use.
    if args['tpu']:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu'])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        args['n_device'] = args['num_tpu_cores']
    elif len(args['gpus'].split(',')) > 1:
        # Several comma-separated GPU ids: mirror the model across all of them.
        gpu_devices = [f"/gpu:{gpu}" for gpu in args['gpus'].split(',')]
        args['n_device'] = len(gpu_devices)
        strategy = tf.distribute.MirroredStrategy(devices=gpu_devices)
    elif args['no_cuda']:
        args['n_device'] = 1
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    else:
        args['n_device'] = len(args['gpus'].split(','))
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0])

    logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s",
                    args['n_device'], bool(args['n_device'] > 1), args['fp16'])

    labels = get_labels(args['labels'])
    # One output class more than there are labels; id 0 is used as the padding label id.
    num_labels = len(labels) + 1
    pad_token_label_id = 0
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
    config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
                                          num_labels=num_labels,
                                          cache_dir=args['cache_dir'] if args['cache_dir'] else None)

    logging.info("Training/evaluation parameters %s", args)

    # Training
    if args['do_train']:
        tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
                                                    do_lower_case=args['do_lower_case'],
                                                    cache_dir=args['cache_dir'] if args['cache_dir'] else None)

        with strategy.scope():
            model = model_class.from_pretrained(args['model_name_or_path'],
                                                from_pt=bool(".bin" in args['model_name_or_path']),
                                                config=config,
                                                cache_dir=args['cache_dir'] if args['cache_dir'] else None)
            model.layers[-1].activation = tf.keras.activations.softmax

        train_batch_size = args['per_device_train_batch_size'] * args['n_device']
        train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train")
        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
        train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id)

        if not os.path.exists(args['output_dir']):
            os.makedirs(args['output_dir'])

        logging.info("Saving model to %s", args['output_dir'])

        model.save_pretrained(args['output_dir'])
        tokenizer.save_pretrained(args['output_dir'])

    # Evaluation
    if args['do_eval']:
        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
        checkpoints = []
        results = []

        if args['eval_all_checkpoints']:
            # Directories holding a TF2 weights file, ordered by the integer
            # formed from every digit in the path (-1 when there is none).
            checkpoints = [os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1))]

        logging.info("Evaluate the following checkpoints: %s", checkpoints)

        if len(checkpoints) == 0:
            checkpoints.append(args['output_dir'])

        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"

            with strategy.scope():
                model = model_class.from_pretrained(checkpoint)

            y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
            report = metrics.classification_report(y_true, y_pred, digits=4)

            if global_step:
                results.append({global_step + "_report": report, global_step + "_loss": eval_loss})

        output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")

        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            for res in results:
                for key, val in res.items():
                    if "loss" in key:
                        logging.info(key + " = " + str(val))
                        writer.write(key + " = " + str(val))
                        writer.write("\n")
                    else:
                        # Emit the report stored for THIS entry; the loop-level
                        # `report` variable only holds the last checkpoint's report.
                        logging.info(key)
                        logging.info("\n" + val)
                        writer.write(key + "\n")
                        writer.write(val)
                        writer.write("\n")

    if args['do_predict']:
        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
        model = model_class.from_pretrained(args['output_dir'])
        eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
        # NOTE(review): predict_dataset is not used below; evaluate() is called with
        # mode="test" and presumably loads the data itself. Confirm before removing.
        predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test")
        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
        output_test_results_file = os.path.join(args['output_dir'], "test_results.txt")
        output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
        report = metrics.classification_report(y_true, y_pred, digits=4)

        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
            logging.info("\n" + report)

            writer.write(report)
            writer.write("\n\nloss = " + str(pred_loss))

        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
            with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f:
                example_id = 0

                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        # Separator lines are copied through unchanged.
                        writer.write(line)

                        # Move to the next example once the current one has
                        # no predictions left.
                        if not y_pred[example_id]:
                            example_id += 1
                    elif y_pred[example_id]:
                        # Pair the first whitespace-separated field of the line
                        # with the next remaining prediction of this example.
                        output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Mark the mandatory flags as required, then hand control over to app.run.
    for required_flag in ("data_dir", "output_dir", "model_name_or_path", "model_type"):
        flags.mark_flag_as_required(required_flag)
    app.run(main)
|
||||||
515
examples/run_xnli.py
Normal file
515
examples/run_xnli.py
Normal file
@@ -0,0 +1,515 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Finetuning multi-lingual models on XNLI (Bert, DistilBERT, XLM).
|
||||||
|
Adapted from `examples/run_glue.py`"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import glob
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||||
|
TensorDataset)
|
||||||
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
|
||||||
|
# Prefer the TensorBoard writer bundled with PyTorch and fall back to
# tensorboardX only when that import is unavailable. Catching ImportError
# (rather than everything) keeps KeyboardInterrupt/SystemExit and unrelated
# errors from being silently swallowed.
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter
|
||||||
|
|
||||||
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
|
from transformers import (WEIGHTS_NAME,
|
||||||
|
BertConfig, BertForSequenceClassification, BertTokenizer,
|
||||||
|
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
|
||||||
|
DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
|
||||||
|
|
||||||
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
|
from transformers import xnli_compute_metrics as compute_metrics
|
||||||
|
from transformers import xnli_output_modes as output_modes
|
||||||
|
from transformers import xnli_processors as processors
|
||||||
|
|
||||||
|
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# Every key of the pretrained config archive maps of the supported
# architectures, flattened into a single tuple.
ALL_MODELS = tuple(
    shortcut
    for config_cls in (BertConfig, DistilBertConfig, XLMConfig)
    for shortcut in config_cls.pretrained_config_archive_map.keys()
)

# Model type name -> (config class, model class, tokenizer class).
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForSequenceClassification, BertTokenizer),
    xlm=(XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    distilbert=(DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
)
|
||||||
|
|
||||||
|
|
||||||
|
def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch with ``args.seed``.

    When ``args.n_gpu`` is positive, all CUDA devices are seeded as well.
    """
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
|
||||||
|
|
||||||
|
|
||||||
|
def train(args, train_dataset, model, tokenizer):
    """ Fine-tune ``model`` on ``train_dataset``.

    Builds the dataloader, an AdamW optimizer with a linear warmup/decay
    schedule, optionally wraps the model for apex fp16, multi-GPU or
    distributed execution, then runs the optimization loop with periodic
    TensorBoard logging and checkpoint saving.

    Returns:
        A ``(global_step, mean_loss)`` tuple: the number of optimizer steps
        taken and the accumulated loss divided by that number.
    """
    on_main_process = args.local_rank in [-1, 0]
    if on_main_process:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    # Number of optimizer updates one pass over the data yields.
    updates_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // updates_per_epoch + 1
    else:
        t_total = updates_per_epoch * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    decayed_params, undecayed_params = [], []
    for param_name, param in model.named_parameters():
        if any(nd in param_name for nd in no_decay):
            undecayed_params.append(param)
        else:
            decayed_params.append(param)
    optimizer_grouped_parameters = [
        {'params': decayed_params, 'weight_decay': args.weight_decay},
        {'params': undecayed_params, 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    world_size = torch.distributed.get_world_size() if args.local_rank != -1 else 1
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps * world_size)
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    running_loss, last_logged_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=not on_main_process)
    set_seed(args)  # Re-seed right before the loop for reproducibility
    for _epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not on_main_process)
        for step, batch in enumerate(epoch_iterator):
            # Set training mode on every step: evaluate() may have switched the model to eval mode.
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = dict(input_ids=batch[0], attention_mask=batch[1], labels=batch[3])
            if args.model_type != 'distilbert':
                # Only BERT gets segment ids; other non-DistilBERT types get None.
                if args.model_type in ['bert']:
                    inputs['token_type_ids'] = batch[2]
                else:
                    inputs['token_type_ids'] = None
            outputs = model(**inputs)
            loss = outputs[0]  # the loss is the first element of the model output

            if args.n_gpu > 1:
                loss = loss.mean()  # average the per-GPU losses
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            running_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                params_to_clip = amp.master_params(optimizer) if args.fp16 else model.parameters()
                torch.nn.utils.clip_grad_norm_(params_to_clip, args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if on_main_process and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics; in-training evaluation only runs outside distributed mode
                    if args.local_rank == -1 and args.evaluate_during_training:
                        eval_metrics = evaluate(args, model, tokenizer)
                        for metric_name, metric_value in eval_metrics.items():
                            tb_writer.add_scalar('eval_{}'.format(metric_name), metric_value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (running_loss - last_logged_loss)/args.logging_steps, global_step)
                    last_logged_loss = running_loss

                if on_main_process and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Unwrap DataParallel / DistributedDataParallel before saving
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if on_main_process:
        tb_writer.close()

    return global_step, running_loss / global_step
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the dataset returned by
    ``load_and_cache_examples(..., evaluate=True)`` for ``args.task_name``.

    The metrics are logged and written to
    ``<args.output_dir>/<prefix>/eval_results.txt``.

    Args:
        args: Run configuration. ``args.eval_batch_size`` is set here.
        model: Model to evaluate. It is wrapped in ``torch.nn.DataParallel``
            when ``args.n_gpu > 1`` and it is not wrapped that way already.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory of the output directory for the results file;
            also appended to the log banners.

    Returns:
        dict: The metrics computed by ``compute_metrics``.

    Raises:
        ValueError: If ``args.output_mode`` is not ``"classification"``.
    """
    eval_task_names = (args.task_name,)
    eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval: the caller may already pass a DataParallel model
        # (train() wraps it before evaluating), so avoid nesting wrappers.
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Per-batch arrays are collected and joined once after the loop
        # instead of re-allocating a growing array on every batch.
        pred_batches = []
        label_batches = []
        model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # Only BERT gets segment ids; other non-DistilBERT types get None.
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            pred_batches.append(logits.detach().cpu().numpy())
            label_batches.append(inputs['labels'].detach().cpu().numpy())

        # Mean loss over batches; it is not part of the returned metrics.
        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(pred_batches, axis=0)
        out_label_ids = np.concatenate(label_batches, axis=0)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        else:
            raise ValueError('No other `output_mode` for XNLI.')
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Return a ``TensorDataset`` of features for ``task``.

    Features are read from a cache file in ``args.data_dir`` when it exists
    and ``args.overwrite_cache`` is not set; otherwise they are built from the
    processor's test examples (``evaluate=True``) or train examples and, on
    the main process, saved to that cache file.

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids, labels).

    Raises:
        ValueError: If the task's output mode is not ``"classification"``.
    """
    for_training = not evaluate
    if for_training and args.local_rank not in [-1, 0]:
        # Other ranks wait here so that only the first process builds the features; they then read the cache.
        torch.distributed.barrier()

    processor = processors[task](language=args.language, train_language=args.train_language)
    output_mode = output_modes[task]

    # Work out the cache file name for this split / model / length / task / language
    split_name = 'test' if evaluate else 'train'
    model_tag = [part for part in args.model_name_or_path.split('/') if part][-1]
    if for_training and args.train_language is not None:
        language_tag = args.train_language
    else:
        language_tag = args.language
    cache_name = 'cached_{}_{}_{}_{}_{}'.format(
        split_name,
        model_tag,
        str(args.max_seq_length),
        str(task),
        str(language_tag))
    cached_features_file = os.path.join(args.data_dir, cache_name)

    use_cache = os.path.exists(cached_features_file) and not args.overwrite_cache
    if use_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)
        pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                label_list=label_list,
                                                max_length=args.max_seq_length,
                                                output_mode=output_mode,
                                                pad_on_left=False,
                                                pad_token=pad_token_id,
                                                pad_token_segment_id=0,
                                                )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if for_training and args.local_rank == 0:
        # Rank 0 has finished with the features; join the barrier the other ranks are waiting on.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([feat.input_ids for feat in features], dtype=torch.long)
    all_attention_mask = torch.tensor([feat.attention_mask for feat in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([feat.token_type_ids for feat in features], dtype=torch.long)
    if output_mode != "classification":
        raise ValueError('No other `output_mode` for XNLI.')
    all_labels = torch.tensor([feat.label for feat in features], dtype=torch.long)

    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
## Required parameters
|
||||||
|
parser.add_argument("--data_dir", default=None, type=str, required=True,
|
||||||
|
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
|
||||||
|
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||||
|
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||||
|
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||||
|
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||||
|
parser.add_argument("--language", default=None, type=str, required=True,
|
||||||
|
help="Evaluation language. Also train language if `train_language` is set to None.")
|
||||||
|
parser.add_argument("--train_language", default=None, type=str,
|
||||||
|
help="Train language if is different of the evaluation language.")
|
||||||
|
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||||
|
help="The output directory where the model predictions and checkpoints will be written.")
|
||||||
|
|
||||||
|
## Other parameters
|
||||||
|
parser.add_argument("--config_name", default="", type=str,
|
||||||
|
help="Pretrained config name or path if not the same as model_name")
|
||||||
|
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||||
|
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||||
|
parser.add_argument("--cache_dir", default="", type=str,
|
||||||
|
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||||
|
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||||
|
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||||
|
"than this will be truncated, sequences shorter will be padded.")
|
||||||
|
parser.add_argument("--do_train", action='store_true',
|
||||||
|
help="Whether to run training.")
|
||||||
|
parser.add_argument("--do_eval", action='store_true',
|
||||||
|
help="Whether to run eval on the test set.")
|
||||||
|
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||||
|
help="Rul evaluation during training at each logging step.")
|
||||||
|
parser.add_argument("--do_lower_case", action='store_true',
|
||||||
|
help="Set this flag if you are using an uncased model.")
|
||||||
|
|
||||||
|
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
|
||||||
|
help="Batch size per GPU/CPU for training.")
|
||||||
|
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||||
|
help="Batch size per GPU/CPU for evaluation.")
|
||||||
|
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||||
|
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||||
|
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||||
|
help="The initial learning rate for Adam.")
|
||||||
|
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||||
|
help="Weight deay if we apply some.")
|
||||||
|
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||||
|
help="Epsilon for Adam optimizer.")
|
||||||
|
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||||
|
help="Max gradient norm.")
|
||||||
|
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||||
|
help="Total number of training epochs to perform.")
|
||||||
|
parser.add_argument("--max_steps", default=-1, type=int,
|
||||||
|
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||||
|
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||||
|
help="Linear warmup over warmup_steps.")
|
||||||
|
|
||||||
|
parser.add_argument('--logging_steps', type=int, default=50,
|
||||||
|
help="Log every X updates steps.")
|
||||||
|
parser.add_argument('--save_steps', type=int, default=50,
|
||||||
|
help="Save checkpoint every X updates steps.")
|
||||||
|
parser.add_argument("--eval_all_checkpoints", action='store_true',
|
||||||
|
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||||
|
parser.add_argument("--no_cuda", action='store_true',
|
||||||
|
help="Avoid using CUDA when available")
|
||||||
|
parser.add_argument('--overwrite_output_dir', action='store_true',
|
||||||
|
help="Overwrite the content of the output directory")
|
||||||
|
parser.add_argument('--overwrite_cache', action='store_true',
|
||||||
|
help="Overwrite the cached training and evaluation sets")
|
||||||
|
parser.add_argument('--seed', type=int, default=42,
|
||||||
|
help="random seed for initialization")
|
||||||
|
|
||||||
|
parser.add_argument('--fp16', action='store_true',
|
||||||
|
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||||
|
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||||
|
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||||
|
"See details at https://nvidia.github.io/apex/amp.html")
|
||||||
|
parser.add_argument("--local_rank", type=int, default=-1,
|
||||||
|
help="For distributed training: local_rank")
|
||||||
|
parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
|
||||||
|
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||||
|
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||||
|
|
||||||
|
# Setup distant debugging if needed
|
||||||
|
if args.server_ip and args.server_port:
|
||||||
|
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||||
|
import ptvsd
|
||||||
|
print("Waiting for debugger attach")
|
||||||
|
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||||
|
ptvsd.wait_for_attach()
|
||||||
|
|
||||||
|
# Setup CUDA, GPU & distributed training
|
||||||
|
if args.local_rank == -1 or args.no_cuda:
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||||
|
args.n_gpu = torch.cuda.device_count()
|
||||||
|
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||||
|
torch.cuda.set_device(args.local_rank)
|
||||||
|
device = torch.device("cuda", args.local_rank)
|
||||||
|
torch.distributed.init_process_group(backend='nccl')
|
||||||
|
args.n_gpu = 1
|
||||||
|
args.device = device
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
|
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||||
|
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||||
|
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||||
|
|
||||||
|
# Set seed
|
||||||
|
set_seed(args)
|
||||||
|
|
||||||
|
# Prepare XNLI task
|
||||||
|
args.task_name = 'xnli'
|
||||||
|
if args.task_name not in processors:
|
||||||
|
raise ValueError("Task not found: %s" % (args.task_name))
|
||||||
|
processor = processors[args.task_name](language=args.language, train_language=args.train_language)
|
||||||
|
args.output_mode = output_modes[args.task_name]
|
||||||
|
label_list = processor.get_labels()
|
||||||
|
num_labels = len(label_list)
|
||||||
|
|
||||||
|
# Load pretrained model and tokenizer
|
||||||
|
if args.local_rank not in [-1, 0]:
|
||||||
|
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||||
|
|
||||||
|
args.model_type = args.model_type.lower()
|
||||||
|
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||||
|
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||||
|
num_labels=num_labels,
|
||||||
|
finetuning_task=args.task_name,
|
||||||
|
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||||
|
do_lower_case=args.do_lower_case,
|
||||||
|
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||||
|
model = model_class.from_pretrained(args.model_name_or_path,
|
||||||
|
from_tf=bool('.ckpt' in args.model_name_or_path),
|
||||||
|
config=config,
|
||||||
|
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||||
|
|
||||||
|
if args.local_rank == 0:
|
||||||
|
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||||
|
|
||||||
|
model.to(args.device)
|
||||||
|
|
||||||
|
logger.info("Training/evaluation parameters %s", args)
|
||||||
|
|
||||||
|
|
||||||
|
# Training
|
||||||
|
if args.do_train:
|
||||||
|
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
|
||||||
|
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||||
|
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||||
|
|
||||||
|
|
||||||
|
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
|
||||||
|
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||||
|
# Create output directory if needed
|
||||||
|
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||||
|
os.makedirs(args.output_dir)
|
||||||
|
|
||||||
|
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||||
|
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||||
|
# They can then be reloaded using `from_pretrained()`
|
||||||
|
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||||
|
model_to_save.save_pretrained(args.output_dir)
|
||||||
|
tokenizer.save_pretrained(args.output_dir)
|
||||||
|
|
||||||
|
# Good practice: save your training arguments together with the trained model
|
||||||
|
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||||
|
|
||||||
|
# Load a trained model and vocabulary that you have fine-tuned
|
||||||
|
model = model_class.from_pretrained(args.output_dir)
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||||
|
model.to(args.device)
|
||||||
|
|
||||||
|
|
||||||
|
# Evaluation
|
||||||
|
results = {}
|
||||||
|
if args.do_eval and args.local_rank in [-1, 0]:
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||||
|
checkpoints = [args.output_dir]
|
||||||
|
if args.eval_all_checkpoints:
|
||||||
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
|
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||||
|
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||||
|
for checkpoint in checkpoints:
|
||||||
|
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||||
|
prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
|
||||||
|
|
||||||
|
model = model_class.from_pretrained(checkpoint)
|
||||||
|
model.to(args.device)
|
||||||
|
result = evaluate(args, model, tokenizer, prefix=prefix)
|
||||||
|
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
|
||||||
|
results.update(result)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
61
examples/summarization/README.md
Normal file
61
examples/summarization/README.md
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# Text Summarization with Pretrained Encoders
|
||||||
|
|
||||||
|
This folder contains part of the code necessary to reproduce the results on abstractive summarization from the article [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) by [Yang Liu](https://nlp-yang.github.io/) and [Mirella Lapata](https://homepages.inf.ed.ac.uk/mlap/). It can also be used to summarize any document.
|
||||||
|
|
||||||
|
The original code can be found in Yang Liu's [GitHub repository](https://github.com/nlpyang/PreSumm).
|
||||||
|
|
||||||
|
The model is loaded with the pre-trained weights for the abstractive summarization model trained on the CNN/Daily Mail dataset, first on an extractive and then on an abstractive task.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```
|
||||||
|
git clone https://github.com/huggingface/transformers && cd transformers
|
||||||
|
pip install [--editable] .
|
||||||
|
pip install nltk py-rouge
|
||||||
|
cd examples/summarization
|
||||||
|
```
|
||||||
|
|
||||||
|
## Reproduce the authors' results on ROUGE
|
||||||
|
|
||||||
|
To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
|
||||||
|
```
|
||||||
|
|
||||||
|
Then move all the stories to the same folder. We will refer to the path where you uncompressed both archives as `$DATA_PATH`. Then run the following in the same folder as `run_summarization.py`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_summarization.py \
|
||||||
|
--documents_dir $DATA_PATH \
|
||||||
|
--summaries_output_dir $SUMMARIES_PATH \ # optional
|
||||||
|
--to_cpu false \
|
||||||
|
--batch_size 4 \
|
||||||
|
--min_length 50 \
|
||||||
|
--max_length 200 \
|
||||||
|
--beam_size 5 \
|
||||||
|
--alpha 0.95 \
|
||||||
|
--block_trigram true \
|
||||||
|
--compute_rouge true
|
||||||
|
```
|
||||||
|
|
||||||
|
The script executes on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
|
||||||
|
|
||||||
|
## Summarize any text
|
||||||
|
|
||||||
|
Put the documents that you would like to summarize in a folder (the path to which is referred to as `$DATA_PATH` below) and run the following in the same folder as `run_summarization.py`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_summarization.py \
|
||||||
|
--documents_dir $DATA_PATH \
|
||||||
|
--summaries_output_dir $SUMMARIES_PATH \ # optional
|
||||||
|
--to_cpu false \
|
||||||
|
--batch_size 4 \
|
||||||
|
--min_length 50 \
|
||||||
|
--max_length 200 \
|
||||||
|
--beam_size 5 \
|
||||||
|
--alpha 0.95 \
|
||||||
|
--block_trigram true
|
||||||
|
```
|
||||||
|
|
||||||
|
You may want to play around with `min_length`, `max_length` and `alpha` to suit your use case. If you want to compute ROUGE on another dataset you will need to tweak the stories/summaries import in `utils_summarization.py` and tell it where to fetch the reference summaries.
|
||||||
119
examples/summarization/configuration_bertabs.py
Normal file
119
examples/summarization/configuration_bertabs.py
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019 The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" BertAbs configuration """
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
BERTABS_FINETUNED_CONFIG_MAP = {
|
||||||
|
"bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-config.json",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class BertAbsConfig(PretrainedConfig):
    r""" Class to store the configuration of the BertAbs model.

    Arguments:
        vocab_size_or_config_json_file: int or str
            Either the size of the vocabulary (int) or the path to a JSON
            file whose keys are copied onto the configuration (str).
        max_pos: int
            The maximum sequence length that this model will be used with.
        enc_layers: int
            The number of hidden layers in the Transformer encoder.
        enc_hidden_size: int
            The size of the encoder's layers.
        enc_heads: int
            The number of attention heads for each attention layer in the encoder.
        enc_ff_size: int
            The size of the encoder's feed-forward layers.
        enc_dropout: float
            The dropout probability for all fully connected layers in the
            embeddings, layers, pooler and also the attention probabilities in
            the encoder.
        dec_layers: int
            The number of hidden layers in the decoder.
        dec_hidden_size: int
            The size of the decoder's layers.
        dec_heads: int
            The number of attention heads for each attention layer in the decoder.
        dec_ff_size: int
            The size of the decoder's feed-forward layers.
        dec_dropout: float
            The dropout probability for all fully connected layers in the
            embeddings, layers, pooler and also the attention probabilities in
            the decoder.

    Raises:
        ValueError: if the first argument is neither an int nor a string.
    """

    pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP

    # NOTE: no trailing comma after `**kwargs`; it is a syntax error on
    # Python 2, which `_input_is_path_to_json` explicitly caters for.
    def __init__(
        self,
        vocab_size_or_config_json_file=30522,
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
        enc_heads=8,
        enc_ff_size=512,
        enc_dropout=0.2,
        dec_layers=6,
        dec_hidden_size=768,
        dec_heads=8,
        dec_ff_size=2048,
        dec_dropout=0.2,
        **kwargs
    ):
        super(BertAbsConfig, self).__init__(**kwargs)

        if self._input_is_path_to_json(vocab_size_or_config_json_file):
            # `io.open` accepts `encoding` on both Python 2 and Python 3,
            # unlike the Python 2 builtin `open`.
            import io

            path_to_json = vocab_size_or_config_json_file
            with io.open(path_to_json, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            # Every key of the JSON file becomes an attribute of the config;
            # the keyword arguments of this constructor are ignored here.
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.max_pos = max_pos

            self.enc_layers = enc_layers
            self.enc_hidden_size = enc_hidden_size
            self.enc_heads = enc_heads
            self.enc_ff_size = enc_ff_size
            self.enc_dropout = enc_dropout

            self.dec_layers = dec_layers
            self.dec_hidden_size = dec_hidden_size
            self.dec_heads = dec_heads
            self.dec_ff_size = dec_ff_size
            self.dec_dropout = dec_dropout
        else:
            # The two literals are concatenated: keep the separating space.
            raise ValueError(
                "First argument must be either a vocabulary size (int) "
                "or the path to a pretrained model config file (str)"
            )

    def _input_is_path_to_json(self, first_argument):
        """ Checks whether the first argument passed to config
        is a string, in which case it is treated as the path to a
        JSON file that contains the config.
        """
        if sys.version_info[0] == 2:
            # On Python 2 a path may be a byte string (`str`) or `unicode`.
            return isinstance(first_argument, (str, unicode))  # noqa: F821
        return isinstance(first_argument, str)
|
||||||
@@ -0,0 +1,163 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Convert BertExtAbs's checkpoints.
|
||||||
|
|
||||||
|
The script looks like it is doing something trivial but it is not. The "weights"
|
||||||
|
proposed by the authors are actually the entire model pickled. We need to load
|
||||||
|
the model within the original codebase to be able to only save its `state_dict`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from collections import namedtuple
|
||||||
|
import logging
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from models.model_builder import AbsSummarizer # The authors' implementation
|
||||||
|
from model_bertabs import BertAbsSummarizer
|
||||||
|
|
||||||
|
from transformers import BertTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_TEXT = 'Hello world! cécé herlolip'
|
||||||
|
|
||||||
|
|
||||||
|
# Immutable bag of hyper-parameters handed to both the authors' model and
# the re-implementation in `convert_bertabs_checkpoints`.
BertAbsConfig = namedtuple(
    "BertAbsConfig",
    [
        "temp_dir",
        "large",
        "use_bert_emb",
        "finetune_bert",
        "encoder",
        "share_emb",
        "max_pos",
        "enc_layers",
        "enc_hidden_size",
        "enc_heads",
        "enc_ff_size",
        "enc_dropout",
        "dec_layers",
        "dec_hidden_size",
        "dec_heads",
        "dec_ff_size",
        "dec_dropout",
    ],
)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    """ Copy/paste and tweak the pre-trained weights provided by the creators
    of BertAbs for the internal architecture.

    The authors' checkpoint is loaded into their own `AbsSummarizer`, its
    `bert`, `decoder` and `generator` state dicts are copied into a fresh
    `BertAbsSummarizer`, both models are run on the same padded sample input
    and, if their outputs agree up to 1e-3, the new model's `state_dict` is
    saved.

    Arguments:
        path_to_checkpoints: string
            Path to the checkpoint released by the authors.
        dump_path: string
            Folder in which the converted `state_dict` is written; it is
            created if needed. An empty value falls back to the current
            working directory.

    Raises:
        ValueError: if the copied generator weights differ from the original
            ones, or if the outputs of the two models differ by more than 1e-3.
    """
    import os  # local import: only needed to build the output path

    # Instantiate the authors' model with the pre-trained weights
    config = BertAbsConfig(
        temp_dir=".",
        finetune_bert=False,
        large=False,
        share_emb=True,
        use_bert_emb=False,
        encoder="bert",
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
        enc_heads=8,
        enc_ff_size=512,
        enc_dropout=0.2,
        dec_layers=6,
        dec_hidden_size=768,
        dec_heads=8,
        dec_ff_size=2048,
        dec_dropout=0.2,
    )
    # The map_location callable keeps every storage where it was deserialized
    # instead of moving it to the device it was saved from.
    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
    original.eval()

    new_model = BertAbsSummarizer(config, torch.device("cpu"))
    new_model.eval()

    # -------------------
    # Convert the weights
    # -------------------

    logger.info("convert the model")
    new_model.bert.load_state_dict(original.bert.state_dict())
    new_model.decoder.load_state_dict(original.decoder.state_dict())
    new_model.generator.load_state_dict(original.generator.state_dict())

    # -----------------------------------
    # Make sure the outputs are identical
    # -----------------------------------

    logger.info("Make sure that the models' outputs are identical")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # prepare the model inputs (padded to 512 tokens)
    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)

    # failsafe to make sure the weights reset does not affect the
    # loaded weights. Raised explicitly so the check survives `python -O`.
    generator_weights_gap = torch.max(
        torch.abs(original.generator[0].weight - new_model.generator[0].weight)
    )
    if not generator_weights_gap == 0:
        raise ValueError("the generator weights were not copied exactly from the original model.")

    # forward pass
    src = encoder_input_ids
    tgt = decoder_input_ids
    segs = token_type_ids = None
    clss = None
    mask_src = encoder_attention_mask = None
    mask_tgt = decoder_attention_mask = None
    mask_cls = None

    # The original model does not apply the generator layer immediately but rather in
    # the beam search (where it combines softmax + linear layer). Since we already
    # apply the softmax in our generation process we only apply the linear layer here.
    # We make sure that the outputs of the full stack are identical
    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
    output_original_generator = original.generator(output_original_model)

    output_converted_model = new_model(
        encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask
    )[0]
    output_converted_generator = new_model.generator(output_converted_model)

    # Scientific notation: `{:.2f}` would print 0.00 for any gap below 0.005.
    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
    print("Maximum absolute difference between the models' outputs: {:.2e}".format(maximum_absolute_difference))
    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_generator - output_original_generator)
    ).item()
    print("Maximum absolute difference between the generators' outputs: {:.2e}".format(maximum_absolute_difference))

    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
    if are_identical:
        logger.info("all weights are equal up to 1e-3")
    else:
        raise ValueError("the weights are different. The new model is likely different from the original one.")

    # The model has been saved with torch.save(model) and this is bound to the exact
    # directory structure. We save the state_dict instead.
    logger.info("saving the model's state dictionary")
    output_dir = dump_path if dump_path else "."
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    torch.save(
        new_model.state_dict(),
        os.path.join(output_dir, "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: both paths are mandatory string options.
    parser = argparse.ArgumentParser()
    for option, description in (
        ("--bertabs_checkpoint_path", "Path the official PyTorch dump."),
        ("--pytorch_dump_folder_path", "Path to the output PyTorch model."),
    ):
        parser.add_argument(option, default=None, type=str, required=True, help=description)
    args = parser.parse_args()

    convert_bertabs_checkpoints(args.bertabs_checkpoint_path, args.pytorch_dump_folder_path)
|
||||||
1161
examples/summarization/modeling_bertabs.py
Normal file
1161
examples/summarization/modeling_bertabs.py
Normal file
File diff suppressed because it is too large
Load Diff
9
examples/summarization/requirements.txt
Normal file
9
examples/summarization/requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# progress bars in model download and training scripts
|
||||||
|
tqdm
|
||||||
|
# Accessing files from S3 directly.
|
||||||
|
boto3
|
||||||
|
# Used for downloading models over HTTP
|
||||||
|
requests
|
||||||
|
# For ROUGE
|
||||||
|
nltk
|
||||||
|
py-rouge
|
||||||
344
examples/summarization/run_summarization.py
Normal file
344
examples/summarization/run_summarization.py
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
#! /usr/bin/python3
|
||||||
|
import argparse
|
||||||
|
from collections import namedtuple
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import DataLoader, SequentialSampler
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from transformers import BertTokenizer
|
||||||
|
|
||||||
|
from modeling_bertabs import BertAbs, build_predictor
|
||||||
|
|
||||||
|
from utils_summarization import (
|
||||||
|
SummarizationDataset,
|
||||||
|
encode_for_summarization,
|
||||||
|
build_mask,
|
||||||
|
fit_to_block_size,
|
||||||
|
compute_token_type_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
# Record produced by `collate` for every batch of documents.
Batch = namedtuple(
    "Batch",
    (
        "document_names",
        "batch_size",
        "src",
        "segs",
        "mask_src",
        "tgt_str",
    ),
)
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(args):
    """ Summarize the documents with the fine-tuned BertAbs model.

    Every batch produced by the data iterator is run through the predictor
    and the resulting summaries are written with `save_summaries` to
    `args.summaries_output_dir`. When `args.compute_rouge` is set, the
    generated and reference summaries are accumulated and the ROUGE scores
    are saved and printed once all the batches have been processed.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
    model.to(args.device)
    model.eval()

    # Vocabulary ids of the special tokens handed to the predictor.
    symbols = {
        symbol: tokenizer.vocab[token]
        for symbol, token in (("BOS", "[unused0]"), ("EOS", "[unused1]"), ("PAD", "[PAD]"))
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        # Imported lazily: only needed when the ROUGE metrics are requested.
        import rouge
        import nltk

        nltk.download('punkt')
        rouge_options = dict(
            metrics=['rouge-n', 'rouge-l'],
            max_n=2,
            limit_length=True,
            length_limit=args.beam_size,
            length_limit_type='words',
            apply_avg=True,
            apply_best=False,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True,
        )
        rouge_evaluator = rouge.Rouge(**rouge_options)

    # these (unused) arguments are defined to keep the compatibility
    # with the legacy code and will be deleted in a next iteration.
    args.result_path = ""
    args.temp_dir = ""

    data_iterator = build_data_iterator(args, tokenizer)
    predictor = build_predictor(args, tokenizer, symbols, model)

    logger.info("***** Running evaluation *****")
    logger.info(" Number examples = %d", len(data_iterator.dataset))
    logger.info(" Batch size = %d", args.batch_size)
    logger.info("")
    logger.info("***** Beam Search parameters *****")
    logger.info(" Beam size = %d", args.beam_size)
    logger.info(" Minimum length = %d", args.min_length)
    logger.info(" Maximum length = %d", args.max_length)
    logger.info(" Alpha (length penalty) = %.2f", args.alpha)
    logger.info(" Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))

    for batch in tqdm(data_iterator):
        translations = predictor.from_batch(predictor.translate_batch(batch))
        summaries = [format_summary(translation) for translation in translations]
        save_summaries(summaries, args.summaries_output_dir, batch.document_names)

        if args.compute_rouge:
            reference_summaries.extend(batch.tgt_str)
            generated_summaries.extend(summaries)

    if not args.compute_rouge:
        return

    scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
    str_scores = format_rouge_scores(scores)
    save_rouge_scores(str_scores)
    print(str_scores)
|
||||||
|
|
||||||
|
|
||||||
|
def save_summaries(summaries, path, original_document_name):
    """ Write the summaries in files that are named after the original
    files, with `_summary` appended to the name.

    `_summary` is inserted before the last extension when the document name
    contains a dot (`story.txt` -> `story_summary.txt`) and appended to the
    name otherwise (`story` -> `story_summary`).

    Attributes:
        summaries: List[string]
            The summaries that we produced.
        path: string
            Path where the summaries will be written.
        original_document_name: List[string]
            Names of the documents that were summarized, in the same order
            as `summaries`.
    """
    for summary, document_name in zip(summaries, original_document_name):
        # Prepare the summary file's name: split on the last dot only.
        bare_document_name, dot, extension = document_name.rpartition(".")
        if dot:
            name = bare_document_name + "_summary." + extension
        else:
            name = document_name + "_summary"

        file_path = os.path.join(path, name)
        # Explicit encoding: the platform default may be unable to encode
        # non-ASCII characters present in a summary.
        with open(file_path, "w", encoding="utf-8") as output:
            output.write(summary)
|
||||||
|
|
||||||
|
|
||||||
|
def format_summary(translation):
    """ Transforms the output of the `from_batch` function
    into nicely formatted summaries.

    The special tokens `[unused0]`, `[unused3]`, `[PAD]` and `[unused1]` are
    removed, runs of spaces are collapsed into a single space and `[unused2]`
    is turned into a sentence boundary (`. `).

    Arguments:
        translation: a 3-item sequence whose first item is the raw summary
            string; the two other items are ignored.

    Returns:
        The cleaned-up summary, stripped of leading and trailing whitespace.
    """
    import re  # local import: the module does not import `re` at file level

    raw_summary, _, _ = translation

    summary = raw_summary
    for special_token in ("[unused0]", "[unused3]", "[PAD]", "[unused1]"):
        summary = summary.replace(special_token, "")

    # `str.replace` treats its argument literally, so a regular expression
    # is required to collapse the spaces left behind by the removed tokens.
    summary = re.sub(r" +", " ", summary)

    return summary.replace(" [unused2] ", ". ").replace("[unused2]", "").strip()
|
||||||
|
|
||||||
|
|
||||||
|
def format_rouge_scores(scores):
    """ Render the ROUGE-1, ROUGE-2 and ROUGE-L scores as a text report.

    `scores` is indexed as `scores[metric][key]` with the metrics `rouge-1`,
    `rouge-2`, `rouge-l` and the keys `f` (F1), `p` (precision) and `r`
    (recall); every value is printed with three decimals.
    """
    report_template = """\n
****** ROUGE SCORES ******

** ROUGE 1
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}

** ROUGE 2
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}

** ROUGE L
F1 >> {:.3f}
Precision >> {:.3f}
Recall >> {:.3f}"""

    # Values in the order the placeholders appear in the template.
    ordered_values = []
    for metric in ("rouge-1", "rouge-2", "rouge-l"):
        for key in ("f", "p", "r"):
            ordered_values.append(scores[metric][key])

    return report_template.format(*ordered_values)
|
||||||
|
|
||||||
|
|
||||||
|
def save_rouge_scores(str_scores):
    """ Write the formatted ROUGE report to `rouge_scores.txt` in the
    current working directory, replacing any previous content.
    """
    scores_path = "rouge_scores.txt"
    with open(scores_path, "w") as scores_file:
        scores_file.write(str_scores)
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# LOAD the dataset
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
def build_data_iterator(args, tokenizer):
    """ Build the `DataLoader` that walks sequentially through the documents.

    Batches of `args.batch_size` examples are assembled by `collate`, with a
    block size of 512 and tensors placed on `args.device`.
    """

    def collate_fn(data):
        return collate(data, tokenizer, block_size=512, device=args.device)

    dataset = load_and_cache_examples(args, tokenizer)
    return DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=args.batch_size,
        collate_fn=collate_fn,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load_and_cache_examples(args, tokenizer):
    """ Return the `SummarizationDataset` built from `args.documents_dir`.

    `tokenizer` is accepted but not used here.
    """
    return SummarizationDataset(args.documents_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def collate(data, tokenizer, block_size, device):
    """ Turn a list of `(name, story, summary)` examples into a `Batch`.

    The examples are tokenized here, batch after batch, so that the encoded
    dataset never has to be kept in memory as a whole. The result is a
    namedtuple because that is what the original BertAbs API expects.
    """
    examples = [example for example in data if len(example[1]) != 0]  # remove empty_files

    names = [name for name, _, _ in examples]
    summaries = [" ".join(summary_list) for _, _, summary_list in examples]

    encoded_text = [
        encode_for_summarization(story, summary, tokenizer) for _, story, summary in examples
    ]
    # Only the encoded stories are fed to the model; every story is fitted
    # to `block_size` with the pad token id.
    padded_stories = [
        fit_to_block_size(encoded_story, block_size, tokenizer.pad_token_id)
        for encoded_story, _ in encoded_text
    ]
    encoded_stories = torch.tensor(padded_stories)

    encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
    encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)

    return Batch(
        document_names=names,
        batch_size=len(encoded_stories),
        src=encoded_stories.to(device),
        segs=encoder_token_type_ids.to(device),
        mask_src=encoder_mask.to(device),
        tgt_str=summaries,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def decode_summary(summary_tokens, tokenizer):
    """ Decode the summary and return it in a format
    suitable for evaluation.

    Arguments:
        summary_tokens: tensor of token ids; it is moved to the CPU and
            converted to a numpy array before being decoded.
        tokenizer: object whose `decode` method turns the ids into a string.

    Returns:
        The list of sentences of the decoded summary, each one terminated
        by a period.
    """
    summary_tokens = summary_tokens.to("cpu").numpy()
    summary = tokenizer.decode(summary_tokens)
    # Splitting on "." leaves an empty fragment after the final period (and
    # for an empty summary); skip blank fragments so that no spurious "."
    # sentence is emitted.
    return [fragment + "." for fragment in summary.split(".") if fragment.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def _str2bool(value):
    """Convert a command-line string into a boolean for ``argparse``.

    ``type=bool`` treats every non-empty string, including "False", as True,
    so the boolean options go through this converter instead.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got %r." % (value,))


def main():
    """ The main function defines the interface with the users.

    It parses the command-line options, selects the device, resolves and
    validates the input/output directories and then calls ``evaluate``.

    Raises:
        FileNotFoundError: if the documents directory is missing or empty.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--documents_dir",
        default=None,
        type=str,
        required=True,
        help="The folder where the documents to summarize are located.",
    )
    parser.add_argument(
        "--summaries_output_dir",
        default=None,
        type=str,
        required=False,
        help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
    )
    parser.add_argument(
        "--compute_rouge",
        default=False,
        type=_str2bool,
        required=False,
        help="Compute the ROUGE metrics during evaluation. Only available for the CNN/DailyMail dataset.",
    )
    # EVALUATION options
    parser.add_argument(
        "--no_cuda",
        default=False,
        type=_str2bool,
        help="Whether to force the execution on CPU.",
    )
    parser.add_argument(
        "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
    )
    # BEAM SEARCH arguments
    parser.add_argument(
        "--min_length",
        default=50,
        type=int,
        help="Minimum number of tokens for the summaries.",
    )
    parser.add_argument(
        "--max_length",
        default=200,
        type=int,
        help="Maixmum number of tokens for the summaries.",
    )
    parser.add_argument(
        "--beam_size",
        default=5,
        type=int,
        help="The number of beams to start with for each example.",
    )
    parser.add_argument(
        "--alpha",
        default=0.95,
        type=float,
        help="The value of alpha for the length penalty in the beam search.",
    )
    parser.add_argument(
        "--block_trigram",
        default=True,
        type=_str2bool,
        help="Whether to block the existence of repeating trigrams in the text generated by beam search.",
    )
    args = parser.parse_args()

    # Select device (distributed not available)
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # Check the existence of directories; summaries default to the folder
    # the documents are read from.
    if not args.summaries_output_dir:
        args.summaries_output_dir = args.documents_dir

    if not documents_dir_is_valid(args.documents_dir):
        raise FileNotFoundError(
            "We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
        )
    os.makedirs(args.summaries_output_dir, exist_ok=True)

    evaluate(args)
|
||||||
|
|
||||||
|
|
||||||
|
def documents_dir_is_valid(path):
    """Return True when *path* is an existing, non-empty directory.

    Args:
        path: filesystem path supplied by the user.

    Returns:
        False if the path does not exist, is not a directory, or contains no
        entries; True otherwise.
    """
    # ``isdir`` rather than ``exists``: a path pointing at a regular file
    # must be rejected here instead of making ``os.listdir`` raise
    # NotADirectoryError.
    if not os.path.isdir(path):
        return False

    return bool(os.listdir(path))
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -10,9 +10,14 @@ from torch.utils.data import Dataset
|
|||||||
# ------------
|
# ------------
|
||||||
|
|
||||||
|
|
||||||
class CNNDailyMailDataset(Dataset):
|
class SummarizationDataset(Dataset):
|
||||||
""" Abstracts the dataset used to train seq2seq models.
|
""" Abstracts the dataset used to train seq2seq models.
|
||||||
|
|
||||||
|
The class will process the documents that are located in the specified
|
||||||
|
folder. The preprocessing will work on any document that is reasonably
|
||||||
|
formatted. On the CNN/DailyMail dataset it will extract both the story
|
||||||
|
and the summary.
|
||||||
|
|
||||||
CNN/Daily News:
|
CNN/Daily News:
|
||||||
|
|
||||||
The CNN/Daily News raw datasets are downloaded from [1]. The stories are
|
The CNN/Daily News raw datasets are downloaded from [1]. The stories are
|
||||||
@@ -25,33 +30,33 @@ class CNNDailyMailDataset(Dataset):
|
|||||||
[2] https://github.com/abisee/cnn-dailymail/
|
[2] https://github.com/abisee/cnn-dailymail/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, tokenizer, prefix="train", data_dir=""):
|
def __init__(self, path="", prefix="train"):
|
||||||
assert os.path.isdir(data_dir)
|
""" We initialize the class by listing all the documents to summarize.
|
||||||
self.tokenizer = tokenizer
|
Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
|
||||||
|
"""
|
||||||
|
assert os.path.isdir(path)
|
||||||
|
|
||||||
# We initialize the class by listing all the files that contain
|
self.documents = []
|
||||||
# stories and summaries. Files are not read in memory given
|
story_filenames_list = os.listdir(path)
|
||||||
# the size of the corpus.
|
for story_filename in story_filenames_list:
|
||||||
self.stories_path = []
|
if "summary" in story_filename:
|
||||||
datasets = ("cnn", "dailymail")
|
continue
|
||||||
for dataset in datasets:
|
path_to_story = os.path.join(path, story_filename)
|
||||||
path_to_stories = os.path.join(data_dir, dataset, "stories")
|
if not os.path.isfile(path_to_story):
|
||||||
story_filenames_list = os.listdir(path_to_stories)
|
continue
|
||||||
for story_filename in story_filenames_list:
|
self.documents.append(path_to_story)
|
||||||
path_to_story = os.path.join(path_to_stories, story_filename)
|
|
||||||
if not os.path.isfile(path_to_story):
|
|
||||||
continue
|
|
||||||
self.stories_path.append(path_to_story)
|
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.stories_path)
|
""" Returns the number of documents. """
|
||||||
|
return len(self.documents)
|
||||||
|
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
story_path = self.stories_path[idx]
|
document_path = self.documents[idx]
|
||||||
with open(story_path, encoding="utf-8") as source:
|
document_name = document_path.split("/")[-1]
|
||||||
|
with open(document_path, encoding="utf-8") as source:
|
||||||
raw_story = source.read()
|
raw_story = source.read()
|
||||||
story_lines, summary_lines = process_story(raw_story)
|
story_lines, summary_lines = process_story(raw_story)
|
||||||
return story_lines, summary_lines
|
return document_name, story_lines, summary_lines
|
||||||
|
|
||||||
|
|
||||||
def process_story(raw_story):
|
def process_story(raw_story):
|
||||||
@@ -81,7 +86,7 @@ def process_story(raw_story):
|
|||||||
story_lines.append(element)
|
story_lines.append(element)
|
||||||
except IndexError:
|
except IndexError:
|
||||||
# if "@highlight" is absent from the file we pop
|
# if "@highlight" is absent from the file we pop
|
||||||
# all elements until there is None.
|
# all elements until there is None, raising an exception.
|
||||||
return story_lines, []
|
return story_lines, []
|
||||||
|
|
||||||
# gather summary lines
|
# gather summary lines
|
||||||
@@ -104,31 +109,22 @@ def _add_missing_period(line):
|
|||||||
# --------------------------
|
# --------------------------
|
||||||
|
|
||||||
|
|
||||||
def fit_to_block_size(sequence, block_size, pad_token):
|
def fit_to_block_size(sequence, block_size, pad_token_id):
|
||||||
""" Adapt the source and target sequences' lengths to the block size.
|
""" Adapt the source and target sequences' lengths to the block size.
|
||||||
If the sequence is shorter than the block size we pad it with -1 ids
|
If the sequence is shorter we append padding token to the right of the sequence.
|
||||||
which correspond to padding tokens.
|
|
||||||
"""
|
"""
|
||||||
if len(sequence) > block_size:
|
if len(sequence) > block_size:
|
||||||
return sequence[:block_size]
|
return sequence[:block_size]
|
||||||
else:
|
else:
|
||||||
sequence.extend([pad_token] * (block_size - len(sequence)))
|
sequence.extend([pad_token_id] * (block_size - len(sequence)))
|
||||||
return sequence
|
return sequence
|
||||||
|
|
||||||
|
|
||||||
def build_lm_labels(sequence, pad_token):
|
def build_mask(sequence, pad_token_id):
|
||||||
""" Padding token, encoded as 0, are represented by the value -1 so they
|
|
||||||
are not taken into account in the loss computation. """
|
|
||||||
padded = sequence.clone()
|
|
||||||
padded[padded == pad_token] = -1
|
|
||||||
return padded
|
|
||||||
|
|
||||||
|
|
||||||
def build_mask(sequence, pad_token):
|
|
||||||
""" Builds the mask. The attention mechanism will only attend to positions
|
""" Builds the mask. The attention mechanism will only attend to positions
|
||||||
with value 1. """
|
with value 1. """
|
||||||
mask = torch.ones_like(sequence)
|
mask = torch.ones_like(sequence)
|
||||||
idx_pad_tokens = sequence == pad_token
|
idx_pad_tokens = sequence == pad_token_id
|
||||||
mask[idx_pad_tokens] = 0
|
mask[idx_pad_tokens] = 0
|
||||||
return mask
|
return mask
|
||||||
|
|
||||||
@@ -138,18 +134,11 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
|
|||||||
as specified in [1] by using `[SEP] [CLS]` tokens to separate
|
as specified in [1] by using `[SEP] [CLS]` tokens to separate
|
||||||
sentences.
|
sentences.
|
||||||
"""
|
"""
|
||||||
story_lines_token_ids = [
|
story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
|
||||||
tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
|
|
||||||
for line in story_lines
|
|
||||||
]
|
|
||||||
summary_lines_token_ids = [
|
|
||||||
tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
|
|
||||||
for line in summary_lines
|
|
||||||
]
|
|
||||||
|
|
||||||
story_token_ids = [
|
story_token_ids = [
|
||||||
token for sentence in story_lines_token_ids for token in sentence
|
token for sentence in story_lines_token_ids for token in sentence
|
||||||
]
|
]
|
||||||
|
summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
|
||||||
summary_token_ids = [
|
summary_token_ids = [
|
||||||
token for sentence in summary_lines_token_ids for token in sentence
|
token for sentence in summary_lines_token_ids for token in sentence
|
||||||
]
|
]
|
||||||
@@ -174,7 +163,7 @@ def compute_token_type_ids(batch, separator_token_id):
|
|||||||
"""
|
"""
|
||||||
batch_embeddings = []
|
batch_embeddings = []
|
||||||
for sequence in batch:
|
for sequence in batch:
|
||||||
sentence_num = 0
|
sentence_num = -1
|
||||||
embeddings = []
|
embeddings = []
|
||||||
for s in sequence:
|
for s in sequence:
|
||||||
if s == separator_token_id:
|
if s == separator_token_id:
|
||||||
@@ -21,7 +21,6 @@ from utils_summarization import (
|
|||||||
compute_token_type_ids,
|
compute_token_type_ids,
|
||||||
fit_to_block_size,
|
fit_to_block_size,
|
||||||
build_mask,
|
build_mask,
|
||||||
build_lm_labels,
|
|
||||||
process_story,
|
process_story,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -88,20 +87,6 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
|||||||
expected_summary_lines = ["It was the best of times."]
|
expected_summary_lines = ["It was the best of times."]
|
||||||
self.assertEqual(expected_summary_lines, summary_lines)
|
self.assertEqual(expected_summary_lines, summary_lines)
|
||||||
|
|
||||||
def test_build_lm_labels_no_padding(self):
|
|
||||||
sequence = torch.tensor([1, 2, 3, 4])
|
|
||||||
expected = sequence
|
|
||||||
np.testing.assert_array_equal(
|
|
||||||
build_lm_labels(sequence, 0).numpy(), expected.numpy()
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_build_lm_labels(self):
|
|
||||||
sequence = torch.tensor([1, 2, 3, 4, 0, 0, 0])
|
|
||||||
expected = torch.tensor([1, 2, 3, 4, -1, -1, -1])
|
|
||||||
np.testing.assert_array_equal(
|
|
||||||
build_lm_labels(sequence, 0).numpy(), expected.numpy()
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_build_mask_no_padding(self):
|
def test_build_mask_no_padding(self):
|
||||||
sequence = torch.tensor([1, 2, 3, 4])
|
sequence = torch.tensor([1, 2, 3, 4])
|
||||||
expected = torch.tensor([1, 1, 1, 1])
|
expected = torch.tensor([1, 1, 1, 1])
|
||||||
@@ -125,7 +110,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
|||||||
[[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
|
[[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
|
||||||
)
|
)
|
||||||
expected = torch.tensor(
|
expected = torch.tensor(
|
||||||
[[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]]
|
[[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]
|
||||||
)
|
)
|
||||||
|
|
||||||
result = compute_token_type_ids(batch, separator)
|
result = compute_token_type_ids(batch, separator)
|
||||||
@@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase):
|
|||||||
logger.addHandler(stream_handler)
|
logger.addHandler(stream_handler)
|
||||||
|
|
||||||
testargs = ["run_squad.py",
|
testargs = ["run_squad.py",
|
||||||
"--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
|
"--data_dir=./examples/tests_samples/SQUAD",
|
||||||
"--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
|
|
||||||
"--model_name=bert-base-uncased",
|
"--model_name=bert-base-uncased",
|
||||||
"--output_dir=./examples/tests_samples/temp_dir",
|
"--output_dir=./examples/tests_samples/temp_dir",
|
||||||
"--max_steps=10",
|
"--max_steps=10",
|
||||||
|
|||||||
140
examples/tests_samples/SQUAD/train-v2.0.json
Normal file
140
examples/tests_samples/SQUAD/train-v2.0.json
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
{
|
||||||
|
"version": "v2.0",
|
||||||
|
"data": [{
|
||||||
|
"title": "Normans",
|
||||||
|
"paragraphs": [{
|
||||||
|
"qas": [{
|
||||||
|
"question": "In what country is Normandy located?",
|
||||||
|
"id": "56ddde6b9a695914005b9628",
|
||||||
|
"answers": [{
|
||||||
|
"text": "France",
|
||||||
|
"answer_start": 159
|
||||||
|
}],
|
||||||
|
"is_impossible": false
|
||||||
|
}, {
|
||||||
|
"question": "When were the Normans in Normandy?",
|
||||||
|
"id": "56ddde6b9a695914005b9629",
|
||||||
|
"answers": [{
|
||||||
|
"text": "10th and 11th centuries",
|
||||||
|
"answer_start": 94
|
||||||
|
}],
|
||||||
|
"is_impossible": false
|
||||||
|
}, {
|
||||||
|
"question": "From which countries did the Norse originate?",
|
||||||
|
"id": "56ddde6b9a695914005b962a",
|
||||||
|
"answers": [{
|
||||||
|
"text": "Denmark, Iceland and Norway",
|
||||||
|
"answer_start": 256
|
||||||
|
}],
|
||||||
|
"is_impossible": false
|
||||||
|
}, {
|
||||||
|
"plausible_answers": [{
|
||||||
|
"text": "Rollo",
|
||||||
|
"answer_start": 308
|
||||||
|
}],
|
||||||
|
"question": "Who did King Charles III swear fealty to?",
|
||||||
|
"id": "5ad39d53604f3c001a3fe8d3",
|
||||||
|
"answers": [],
|
||||||
|
"is_impossible": true
|
||||||
|
}, {
|
||||||
|
"plausible_answers": [{
|
||||||
|
"text": "10th century",
|
||||||
|
"answer_start": 671
|
||||||
|
}],
|
||||||
|
"question": "When did the Frankish identity emerge?",
|
||||||
|
"id": "5ad39d53604f3c001a3fe8d4",
|
||||||
|
"answers": [],
|
||||||
|
"is_impossible": true
|
||||||
|
}],
|
||||||
|
"context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
|
||||||
|
}, {
|
||||||
|
"qas": [{
|
||||||
|
"question": "Who was the duke in the battle of Hastings?",
|
||||||
|
"id": "56dddf4066d3e219004dad5f",
|
||||||
|
"answers": [{
|
||||||
|
"text": "William the Conqueror",
|
||||||
|
"answer_start": 1022
|
||||||
|
}],
|
||||||
|
"is_impossible": false
|
||||||
|
}, {
|
||||||
|
"plausible_answers": [{
|
||||||
|
"text": "Antioch",
|
||||||
|
"answer_start": 1295
|
||||||
|
}],
|
||||||
|
"question": "What principality did William the conquerer found?",
|
||||||
|
"id": "5ad3a266604f3c001a3fea2b",
|
||||||
|
"answers": [],
|
||||||
|
"is_impossible": true
|
||||||
|
}],
|
||||||
|
"context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
|
||||||
|
}]
|
||||||
|
}, {
|
||||||
|
"title": "Computational_complexity_theory",
|
||||||
|
"paragraphs": [{
|
||||||
|
"qas": [{
|
||||||
|
"question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
|
||||||
|
"id": "56e16182e3433e1400422e28",
|
||||||
|
"answers": [{
|
||||||
|
"text": "Computational complexity theory",
|
||||||
|
"answer_start": 0
|
||||||
|
}],
|
||||||
|
"is_impossible": false
|
||||||
|
}, {
|
||||||
|
"plausible_answers": [{
|
||||||
|
"text": "algorithm",
|
||||||
|
"answer_start": 472
|
||||||
|
}],
|
||||||
|
"question": "What is a manual application of mathematical steps?",
|
||||||
|
"id": "5ad5316b5b96ef001a10ab76",
|
||||||
|
"answers": [],
|
||||||
|
"is_impossible": true
|
||||||
|
}],
|
||||||
|
"context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
|
||||||
|
}, {
|
||||||
|
"qas": [{
|
||||||
|
"question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
|
||||||
|
"id": "56e16839cd28a01900c67887",
|
||||||
|
"answers": [{
|
||||||
|
"text": "if its solution requires significant resources",
|
||||||
|
"answer_start": 46
|
||||||
|
}],
|
||||||
|
"is_impossible": false
|
||||||
|
}, {
|
||||||
|
"question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
|
||||||
|
"id": "56e16839cd28a01900c67888",
|
||||||
|
"answers": [{
|
||||||
|
"text": "mathematical models of computation",
|
||||||
|
"answer_start": 176
|
||||||
|
}],
|
||||||
|
"is_impossible": false
|
||||||
|
}, {
|
||||||
|
"question": "What are two basic primary resources used to guage complexity?",
|
||||||
|
"id": "56e16839cd28a01900c67889",
|
||||||
|
"answers": [{
|
||||||
|
"text": "time and storage",
|
||||||
|
"answer_start": 305
|
||||||
|
}],
|
||||||
|
"is_impossible": false
|
||||||
|
}, {
|
||||||
|
"plausible_answers": [{
|
||||||
|
"text": "the number of gates in a circuit",
|
||||||
|
"answer_start": 436
|
||||||
|
}],
|
||||||
|
"question": "What unit is measured to determine circuit simplicity?",
|
||||||
|
"id": "5ad532575b96ef001a10ab7f",
|
||||||
|
"answers": [],
|
||||||
|
"is_impossible": true
|
||||||
|
}, {
|
||||||
|
"plausible_answers": [{
|
||||||
|
"text": "the number of processors",
|
||||||
|
"answer_start": 502
|
||||||
|
}],
|
||||||
|
"question": "What number is used in perpendicular computing?",
|
||||||
|
"id": "5ad532575b96ef001a10ab80",
|
||||||
|
"answers": [],
|
||||||
|
"is_impossible": true
|
||||||
|
}],
|
||||||
|
"context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,330 +0,0 @@
|
|||||||
""" Official evaluation script for SQuAD version 2.0.
|
|
||||||
Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
|
|
||||||
|
|
||||||
In addition to basic functionality, we also compute additional statistics and
|
|
||||||
plot precision-recall curves if an additional na_prob.json file is provided.
|
|
||||||
This file is expected to map question ID's to the model's predicted probability
|
|
||||||
that a question is unanswerable.
|
|
||||||
"""
|
|
||||||
import argparse
|
|
||||||
import collections
|
|
||||||
import json
|
|
||||||
import numpy as np
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import string
|
|
||||||
import sys
|
|
||||||
|
|
||||||
class EVAL_OPTS:
    """Plain container for the evaluation options.

    Holds the same fields that ``parse_args`` produces, so the evaluation
    can be configured from code instead of from the command line.
    """

    def __init__(
        self,
        data_file,
        pred_file,
        out_file="",
        na_prob_file="na_prob.json",
        na_prob_thresh=1.0,
        out_image_dir=None,
        verbose=False,
    ):
        # Input data and model predictions.
        self.data_file = data_file
        self.pred_file = pred_file
        # Where the metrics are written.
        self.out_file = out_file
        # No-answer probabilities and the threshold applied to them.
        self.na_prob_file = na_prob_file
        self.na_prob_thresh = na_prob_thresh
        # Optional directory for plots, and verbosity switch.
        self.out_image_dir = out_image_dir
        self.verbose = verbose
|
|
||||||
|
|
||||||
OPTS = None
|
|
||||||
|
|
||||||
def parse_args():
    """Parse the command-line options of the evaluation script.

    When the script is invoked without any argument the help text is
    printed and the process exits with status 1.

    Returns:
        The ``argparse.Namespace`` with the parsed options.
    """
    parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
    # (flags, keyword settings) pairs, registered in this order.
    argument_table = (
        (('data_file',),
         dict(metavar='data.json', help='Input data JSON file.')),
        (('pred_file',),
         dict(metavar='pred.json', help='Model predictions.')),
        (('--out-file', '-o'),
         dict(metavar='eval.json',
              help='Write accuracy metrics to file (default is stdout).')),
        (('--na-prob-file', '-n'),
         dict(metavar='na_prob.json',
              help='Model estimates of probability of no answer.')),
        (('--na-prob-thresh', '-t'),
         dict(type=float, default=1.0,
              help='Predict "" if no-answer probability exceeds this (default = 1.0).')),
        (('--out-image-dir', '-p'),
         dict(metavar='out_images', default=None,
              help='Save precision-recall curves to directory.')),
        (('--verbose', '-v'),
         dict(action='store_true')),
    )
    for flags, settings in argument_table:
        parser.add_argument(*flags, **settings)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    return parser.parse_args()
|
|
||||||
|
|
||||||
def make_qid_to_has_ans(dataset):
    """Map every question id to whether it has at least one gold answer.

    Args:
        dataset: list of articles, each with ``paragraphs`` containing
            ``qas`` entries that carry ``id`` and ``answers``.

    Returns:
        Dict from question id to a bool (True when ``answers`` is non-empty).
    """
    return {
        qa['id']: bool(qa['answers'])
        for article in dataset
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    }
|
|
||||||
|
|
||||||
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    The steps run in this order: lowercase, strip punctuation characters,
    replace the articles "a", "an" and "the" with a space, then collapse
    whitespace.
    """
    punctuation = set(string.punctuation)
    articles = re.compile(r'\b(a|an|the)\b', re.UNICODE)

    lowered = s.lower()
    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = articles.sub(' ', without_punctuation)
    return ' '.join(without_articles.split())
|
|
||||||
|
|
||||||
def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer.

    A falsy input (empty string, ``None``) yields an empty list.
    """
    return normalize_answer(s).split() if s else []
|
|
||||||
|
|
||||||
def compute_exact(a_gold, a_pred):
    """Return 1 if both answers are equal after normalization, else 0."""
    gold_normalized = normalize_answer(a_gold)
    pred_normalized = normalize_answer(a_pred)
    return 1 if gold_normalized == pred_normalized else 0
|
|
||||||
|
|
||||||
def compute_f1(a_gold, a_pred):
    """Compute the token-level F1 between a gold and a predicted answer.

    Returns:
        1 or 0 when either answer has no tokens (1 only if both are empty),
        0 when no token is shared, otherwise the harmonic mean of token
        precision and recall.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    if not gold_toks or not pred_toks:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)

    # Multiset intersection: shared tokens counted with multiplicity.
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
|
|
||||||
|
|
||||||
def get_raw_scores(dataset, preds):
    """Score every prediction against the gold answers.

    Args:
        dataset: list of articles with ``paragraphs`` / ``qas`` entries.
        preds: dict from question id to the predicted answer string.

    Returns:
        ``(exact_scores, f1_scores)``, two dicts keyed by question id. A
        question without a prediction is reported on stdout and left out.
    """
    exact_scores, f1_scores = {}, {}
    all_qas = (
        qa
        for article in dataset
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    )
    for qa in all_qas:
        qid = qa['id']
        # Keep gold answers that are non-empty after normalization; for
        # unanswerable questions, only correct answer is empty string.
        gold_answers = [
            answer['text'] for answer in qa['answers']
            if normalize_answer(answer['text'])
        ] or ['']
        if qid not in preds:
            print('Missing prediction for %s' % qid)
            continue
        a_pred = preds[qid]
        # Take max over all gold answers
        exact_scores[qid] = max(compute_exact(gold, a_pred) for gold in gold_answers)
        f1_scores[qid] = max(compute_f1(gold, a_pred) for gold in gold_answers)
    return exact_scores, f1_scores
|
|
||||||
|
|
||||||
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    """Re-score questions whose no-answer probability exceeds the threshold.

    When ``na_probs[qid] > na_prob_thresh`` the prediction is treated as
    "no answer": the score becomes 1.0 if the question indeed has no answer
    and 0.0 otherwise. All other questions keep their original score.

    Returns:
        A new dict of scores keyed by question id.
    """
    return {
        qid: float(not qid_to_has_ans[qid]) if na_probs[qid] > na_prob_thresh else score
        for qid, score in scores.items()
    }
|
|
||||||
|
|
||||||
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Aggregate per-question scores into percentage metrics.

    Args:
        exact_scores: dict from question id to exact-match score.
        f1_scores: dict from question id to F1 score.
        qid_list: optional subset of question ids; when empty or ``None``
            every entry of the score dicts is used.

    Returns:
        An ``OrderedDict`` with ``exact`` and ``f1`` (both scaled to 0-100)
        and ``total`` (the number of questions averaged over).
    """
    if qid_list:
        total = len(qid_list)
        exact_sum = sum(exact_scores[qid] for qid in qid_list)
        f1_sum = sum(f1_scores[qid] for qid in qid_list)
    else:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())

    return collections.OrderedDict([
        ('exact', 100.0 * exact_sum / total),
        ('f1', 100.0 * f1_sum / total),
        ('total', total),
    ])
|
|
||||||
|
|
||||||
def merge_eval(main_eval, new_eval, prefix):
    """Copy every entry of ``new_eval`` into ``main_eval`` as ``<prefix>_<key>``.

    ``main_eval`` is updated in place; nothing is returned.
    """
    for key, value in new_eval.items():
        main_eval['%s_%s' % (prefix, key)] = value
|
|
||||||
|
|
||||||
def plot_pr_curve(precisions, recalls, out_image, title):
    """Draw a precision-recall step curve and save it to ``out_image``.

    The current figure is cleared after saving.
    """
    curve_style = dict(color='b', alpha=0.2)
    plt.step(recalls, precisions, where='post', **curve_style)
    plt.fill_between(recalls, precisions, step='post', **curve_style)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    # Both axes share the same range, slightly past 1.0.
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.title(title)

    plt.savefig(out_image)
    plt.clf()
|
|
||||||
|
|
||||||
def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
                               out_image=None, title=None):
    """Compute the average precision over no-answer probability thresholds.

    Questions are visited in increasing order of ``na_probs``. A point is
    added to the curve only where a threshold can be placed, i.e. after the
    last question or where the next question has a different probability.

    Args:
        scores: dict from question id to score.
        na_probs: dict from question id to no-answer probability.
        num_true_pos: number of questions that have an answer.
        qid_to_has_ans: dict from question id to bool.
        out_image: if truthy, path the curve is plotted to.
        title: title of the plot.

    Returns:
        ``{'ap': <average precision scaled to 0-100>}``.
    """
    ordered_qids = sorted(na_probs, key=lambda qid: na_probs[qid])
    last_index = len(ordered_qids) - 1

    precisions, recalls = [1.0], [0.0]
    accumulated_score = 0.0
    avg_prec = 0.0
    for index, qid in enumerate(ordered_qids):
        if qid_to_has_ans[qid]:
            accumulated_score += scores[qid]
        precision = accumulated_score / float(index + 1)
        recall = accumulated_score / float(num_true_pos)

        is_last = index == last_index
        if is_last or na_probs[qid] != na_probs[ordered_qids[index + 1]]:
            # i.e., if we can put a threshold after this point
            avg_prec += precision * (recall - recalls[-1])
            precisions.append(precision)
            recalls.append(recall)

    if out_image:
        plot_pr_curve(precisions, recalls, out_image, title)
    return {'ap': 100.0 * avg_prec}
|
|
||||||
|
|
||||||
def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
                                  qid_to_has_ans, out_image_dir):
    """Add the average precision of the exact, F1 and oracle curves to ``main_eval``.

    The image directory is created when it is given and missing. Nothing is
    computed when no question has an answer. Results are stored under the
    ``pr_exact_*``, ``pr_f1_*`` and ``pr_oracle_*`` keys of ``main_eval``.
    """
    if out_image_dir and not os.path.exists(out_image_dir):
        os.makedirs(out_image_dir)

    num_true_pos = sum(1 for has_ans in qid_to_has_ans.values() if has_ans)
    if not num_true_pos:
        return

    def curve_eval(raw_scores, image_name, title):
        # Shared call shape for the three curves below.
        return make_precision_recall_eval(
            raw_scores, na_probs, num_true_pos, qid_to_has_ans,
            out_image=os.path.join(out_image_dir, image_name),
            title=title)

    pr_exact = curve_eval(
        exact_raw, 'pr_exact.png',
        'Precision-Recall curve for Exact Match score')
    pr_f1 = curve_eval(
        f1_raw, 'pr_f1.png',
        'Precision-Recall curve for F1 score')
    # The oracle scores a question 1.0 exactly when it has an answer.
    oracle_scores = {qid: float(has_ans) for qid, has_ans in qid_to_has_ans.items()}
    pr_oracle = curve_eval(
        oracle_scores, 'pr_oracle.png',
        'Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')

    for prefix, curve in (('pr_exact', pr_exact),
                          ('pr_f1', pr_f1),
                          ('pr_oracle', pr_oracle)):
        merge_eval(main_eval, curve, prefix)
|
|
||||||
|
|
||||||
def histogram_na_prob(na_probs, qid_list, image_dir, name):
    """Save a histogram of the no-answer probabilities of ``qid_list``.

    Does nothing when ``qid_list`` is empty. The image is written to
    ``image_dir`` as ``na_prob_hist_<name>.png`` and the current figure is
    cleared afterwards.
    """
    if not qid_list:
        return

    probabilities = [na_probs[qid] for qid in qid_list]
    # Equal weights summing to one, so bar heights are proportions.
    weights = np.ones_like(probabilities) / float(len(probabilities))
    plt.hist(probabilities, weights=weights, bins=20, range=(0.0, 1.0))

    plt.xlabel('Model probability of no-answer')
    plt.ylabel('Proportion of dataset')
    plt.title('Histogram of no-answer probability: %s' % name)

    target = os.path.join(image_dir, 'na_prob_hist_%s.png' % name)
    plt.savefig(target)
    plt.clf()
|
|
||||||
|
|
||||||
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    """Find the no-answer threshold that maximizes the total score.

    Starting from "predict no-answer everywhere" (one point per unanswerable
    question), questions are walked in increasing order of ``na_probs`` and
    the running score is updated as each one switches to its prediction.

    Returns:
        ``(best score scaled to 0-100 over len(scores), best threshold)``.
    """
    best_score = running_score = sum(
        1 for qid in qid_to_has_ans if not qid_to_has_ans[qid]
    )
    best_thresh = 0.0

    for qid in sorted(na_probs, key=lambda k: na_probs[k]):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            delta = scores[qid]
        else:
            # An unanswerable question loses its point only if the model
            # actually predicted some text for it.
            delta = -1 if preds[qid] else 0
        running_score += delta
        if running_score > best_score:
            best_score = running_score
            best_thresh = na_probs[qid]

    return 100.0 * best_score / len(scores), best_thresh
|
|
||||||
|
|
||||||
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
    """Find the best no-answer cut-off and the mean score on answerable questions.

    The first pass is the same sweep as ``find_best_thresh``: the running
    score starts at the number of questions without an answer, question ids
    are visited in ascending ``na_probs`` order, and the best running score
    and its probability are remembered. The second pass averages the raw
    scores over the questions that have an answer.

    Args:
        preds: mapping from question id to the predicted answer text.
        scores: mapping from question id to its raw score.
        na_probs: mapping from question id to no-answer probability.
        qid_to_has_ans: mapping from question id to whether it has an answer.

    Returns:
        A ``(best_score_percent, best_thresh, has_ans_avg)`` tuple.
        ``has_ans_avg`` is ``0.0`` when no visited question has an answer
        (previously this case raised ``ZeroDivisionError``).

    Raises:
        ZeroDivisionError: if ``scores`` is empty.
    """
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for qid in qid_list:
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        elif preds[qid]:
            # Unanswerable question with a non-empty prediction: lose a point.
            diff = -1
        else:
            diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]

    has_ans_score, has_ans_cnt = 0, 0
    for qid in qid_list:
        if not qid_to_has_ans[qid]:
            continue
        # Every answerable question counts in the denominator, scored or not.
        has_ans_cnt += 1
        if qid not in scores:
            continue
        has_ans_score += scores[qid]

    # Guard the average: a data set with only unanswerable questions is valid.
    has_ans_avg = 1.0 * has_ans_score / has_ans_cnt if has_ans_cnt else 0.0
    return 100.0 * best_score / len(scores), best_thresh, has_ans_avg
|
|
||||||
|
|
||||||
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Store the best exact-match and F1 scores, with their cut-offs, in main_eval.

    Runs ``find_best_thresh`` once on ``exact_raw`` and once on ``f1_raw`` and
    writes the results under the keys ``best_exact``, ``best_exact_thresh``,
    ``best_f1`` and ``best_f1_thresh``. ``main_eval`` is modified in place and
    nothing is returned.
    """
    exact_score, exact_cutoff = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    f1_score, f1_cutoff = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
    # A tuple of pairs keeps the insertion order of the four keys explicit.
    results = (
        ('best_exact', exact_score),
        ('best_exact_thresh', exact_cutoff),
        ('best_f1', f1_score),
        ('best_f1_thresh', f1_cutoff),
    )
    for key, value in results:
        main_eval[key] = value
|
|
||||||
|
|
||||||
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Store best scores, cut-offs and answerable-only averages in main_eval.

    Runs ``find_best_thresh_v2`` once on ``exact_raw`` and once on ``f1_raw``
    and writes the results under the keys ``best_exact``,
    ``best_exact_thresh``, ``best_f1``, ``best_f1_thresh``, ``has_ans_exact``
    and ``has_ans_f1``. ``main_eval`` is modified in place and nothing is
    returned.
    """
    exact_score, exact_cutoff, exact_on_answerable = find_best_thresh_v2(
        preds, exact_raw, na_probs, qid_to_has_ans)
    f1_score, f1_cutoff, f1_on_answerable = find_best_thresh_v2(
        preds, f1_raw, na_probs, qid_to_has_ans)
    # A tuple of pairs keeps the insertion order of the six keys explicit.
    results = (
        ('best_exact', exact_score),
        ('best_exact_thresh', exact_cutoff),
        ('best_f1', f1_score),
        ('best_f1_thresh', f1_cutoff),
        ('has_ans_exact', exact_on_answerable),
        ('has_ans_f1', f1_on_answerable),
    )
    for key, value in results:
        main_eval[key] = value
|
|
||||||
|
|
||||||
def main(OPTS):
    """Run the evaluation described by OPTS and return the metrics mapping.

    Reads the data set from ``OPTS.data_file`` and the predictions from
    ``OPTS.pred_file``. No-answer probabilities come from
    ``OPTS.na_prob_file`` when it is set; otherwise every prediction gets
    probability 0.0. The metrics are written as JSON to ``OPTS.out_file`` when
    it is set and printed otherwise. Threshold search only runs when a
    probability file was given, and plots additionally need
    ``OPTS.out_image_dir``.
    """
    with open(OPTS.data_file) as data_fh:
        dataset = json.load(data_fh)['data']
    with open(OPTS.pred_file) as pred_fh:
        preds = json.load(pred_fh)
    if OPTS.na_prob_file:
        with open(OPTS.na_prob_file) as prob_fh:
            na_probs = json.load(prob_fh)
    else:
        na_probs = dict.fromkeys(preds, 0.0)

    # Maps each question id to True/False, then splits the ids by that flag.
    qid_to_has_ans = make_qid_to_has_ans(dataset)
    has_ans_qids, no_ans_qids = [], []
    for qid, has_answer in qid_to_has_ans.items():
        (has_ans_qids if has_answer else no_ans_qids).append(qid)

    exact_raw, f1_raw = get_raw_scores(dataset, preds)
    exact_thresh = apply_no_ans_threshold(
        exact_raw, na_probs, qid_to_has_ans, OPTS.na_prob_thresh)
    f1_thresh = apply_no_ans_threshold(
        f1_raw, na_probs, qid_to_has_ans, OPTS.na_prob_thresh)

    out_eval = make_eval_dict(exact_thresh, f1_thresh)
    # Per-subset metrics are merged only for subsets that are non-empty.
    for prefix, subset in (('HasAns', has_ans_qids), ('NoAns', no_ans_qids)):
        if subset:
            subset_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=subset)
            merge_eval(out_eval, subset_eval, prefix)

    if OPTS.na_prob_file:
        find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
        if OPTS.out_image_dir:
            run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
                                          qid_to_has_ans, OPTS.out_image_dir)
            for subset, label in ((has_ans_qids, 'hasAns'), (no_ans_qids, 'noAns')):
                histogram_na_prob(na_probs, subset, OPTS.out_image_dir, label)

    if not OPTS.out_file:
        print(json.dumps(out_eval, indent=2))
    else:
        with open(OPTS.out_file, 'w') as out_fh:
            json.dump(out_eval, out_fh)
    return out_eval
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: parse the options, then run the evaluation.
    OPTS = parse_args()
    if OPTS.out_image_dir:
        # matplotlib is imported only when plots were requested. The 'Agg'
        # backend is selected before pyplot is imported, and `plt` becomes a
        # module-level name used by the plotting helpers.
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    main(OPTS)
|
|
||||||
13
setup.py
13
setup.py
@@ -36,9 +36,15 @@ To create the package for pypi.
|
|||||||
from io import open
|
from io import open
|
||||||
from setuptools import find_packages, setup
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
|
|
||||||
|
# Optional dependency groups; passed to setup() as `extras_require`.
extras = {
    'serving': ['uvicorn', 'fastapi'],
}
# 'all' must be a flat list of requirement strings, so flatten every group
# instead of collecting the group lists themselves (a list of lists).
extras['all'] = [package for packages in extras.values() for package in packages]
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="transformers",
|
name="transformers",
|
||||||
version="2.1.1",
|
version="2.2.1",
|
||||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
||||||
author_email="thomas@huggingface.co",
|
author_email="thomas@huggingface.co",
|
||||||
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
||||||
@@ -61,8 +67,11 @@ setup(
|
|||||||
"transformers=transformers.__main__:main",
|
"transformers=transformers.__main__:main",
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
extras_require=extras,
|
||||||
|
scripts=[
|
||||||
|
'transformers-cli'
|
||||||
|
],
|
||||||
# python_requires='>=3.5.0',
|
# python_requires='>=3.5.0',
|
||||||
tests_require=['pytest'],
|
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Intended Audience :: Science/Research',
|
'Intended Audience :: Science/Research',
|
||||||
'License :: OSI Approved :: Apache Software License',
|
'License :: OSI Approved :: Apache Software License',
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
||||||
RawResult, write_predictions,
|
RawResult, write_predictions,
|
||||||
@@ -98,7 +98,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ The library is designed to incorporate a variety of models and code bases. As su
|
|||||||
One important point though is that the library has the following goals impacting the way models are incorporated:
|
One important point though is that the library has the following goals impacting the way models are incorporated:
|
||||||
|
|
||||||
- one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often has to be slightly adapted to allow for running in the python interpreter.
|
- one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often has to be slightly adapted to allow for running in the python interpreter.
|
||||||
- the package is also designed to be as self-consistent and with a small and reliable set of packages dependencies. In consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificites includes `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.
|
- the package is also designed to be as self-consistent and with a small and reliable set of packages dependencies. In consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.
|
||||||
|
|
||||||
For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).
|
For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).
|
||||||
|
|
||||||
@@ -20,7 +20,7 @@ Here an overview of the general workflow:
|
|||||||
- [ ] add tests
|
- [ ] add tests
|
||||||
- [ ] finalize
|
- [ ] finalize
|
||||||
|
|
||||||
Let's details what should be done at each step
|
Let's detail what should be done at each step
|
||||||
|
|
||||||
## Adding model/configuration/tokenization classes
|
## Adding model/configuration/tokenization classes
|
||||||
|
|
||||||
@@ -28,16 +28,16 @@ Here is the workflow for adding model/configuration/tokenization classes:
|
|||||||
|
|
||||||
- [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name,
|
- [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name,
|
||||||
- [ ] edit the files to replace `XXX` (with various casing) with your model name
|
- [ ] edit the files to replace `XXX` (with various casing) with your model name
|
||||||
- [ ] copy-past or create a simple configuration class for your model in the `configuration_...` file
|
- [ ] copy-paste or create a simple configuration class for your model in the `configuration_...` file
|
||||||
- [ ] copy-past or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
|
- [ ] copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
|
||||||
- [ ] copy-past or create a tokenizer class for your model in the `tokenization_...` file
|
- [ ] copy-paste or create a tokenizer class for your model in the `tokenization_...` file
|
||||||
|
|
||||||
# Adding conversion scripts
|
# Adding conversion scripts
|
||||||
|
|
||||||
Here is the workflow for the conversion scripts:
|
Here is the workflow for the conversion scripts:
|
||||||
|
|
||||||
- [ ] copy the conversion script (`convert_...`) from the present folder to the main folder.
|
- [ ] copy the conversion script (`convert_...`) from the present folder to the main folder.
|
||||||
- [ ] edit this scipt to convert your original checkpoint weights to the current pytorch ones.
|
- [ ] edit this script to convert your original checkpoint weights to the current pytorch ones.
|
||||||
|
|
||||||
# Adding tests:
|
# Adding tests:
|
||||||
|
|
||||||
@@ -58,5 +58,5 @@ You can then finish the addition step by adding imports for your classes in the
|
|||||||
- [ ] add your models and tokenizer to `pipeline.py`
|
- [ ] add your models and tokenizer to `pipeline.py`
|
||||||
- [ ] add a link to your conversion script in the main conversion utility (currently in `__main__` but will be moved to the `commands` subfolder in the near future)
|
- [ ] add a link to your conversion script in the main conversion utility (currently in `__main__` but will be moved to the `commands` subfolder in the near future)
|
||||||
- [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
|
- [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
|
||||||
- [ ] add a mention of your model in the doc: `README.md` and the documentation it-self at `docs/source/pretrained_models.rst`.
|
- [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`.
|
||||||
- [ ] upload the pretrained weights, configurations and vocabulary files.
|
- [ ] upload the pretrained weights, configurations and vocabulary files.
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ import numpy as np
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
|
||||||
from .configuration_xxx import XxxConfig
|
from .configuration_xxx import XxxConfig
|
||||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
|
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||||
from .file_utils import add_start_docstrings
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -51,7 +51,7 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
####################################################
|
####################################################
|
||||||
# TF 2.0 Models are constructed using Keras imperative API by sub-classing
|
# TF 2.0 Models are constructed using Keras imperative API by sub-classing
|
||||||
# - tf.keras.layers.Layer for the layers and
|
# - tf.keras.layers.Layer for the layers and
|
||||||
# - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
|
# - TFPreTrainedModel for the models (itself a sub-class of tf.keras.Model)
|
||||||
####################################################
|
####################################################
|
||||||
|
|
||||||
####################################################
|
####################################################
|
||||||
@@ -123,9 +123,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
|
|||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = tf.fill(tf.shape(input_ids), 1)
|
attention_mask = tf.fill(shape_list(input_ids), 1)
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
token_type_ids = tf.fill(tf.shape(input_ids), 0)
|
token_type_ids = tf.fill(shape_list(input_ids), 0)
|
||||||
|
|
||||||
# We create a 3D attention mask from a 2D tensor mask.
|
# We create a 3D attention mask from a 2D tensor mask.
|
||||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
@@ -257,6 +257,10 @@ XXX_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
|
||||||
|
|||||||
@@ -122,7 +122,7 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
|
|||||||
####################################################
|
####################################################
|
||||||
# PyTorch Models are constructed by sub-classing
|
# PyTorch Models are constructed by sub-classing
|
||||||
# - torch.nn.Module for the layers and
|
# - torch.nn.Module for the layers and
|
||||||
# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
|
# - PreTrainedModel for the models (itself a sub-class of torch.nn.Module)
|
||||||
####################################################
|
####################################################
|
||||||
|
|
||||||
####################################################
|
####################################################
|
||||||
@@ -240,6 +240,10 @@ XXX_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
@@ -296,11 +300,22 @@ class XxxModel(XxxPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
|
||||||
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = inputs_embeds.size()[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
|
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||||
|
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = torch.ones_like(input_ids)
|
attention_mask = torch.ones(input_shape, device=device)
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
token_type_ids = torch.zeros_like(input_ids)
|
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
||||||
|
|
||||||
# We create a 3D attention mask from a 2D tensor mask.
|
# We create a 3D attention mask from a 2D tensor mask.
|
||||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
@@ -334,7 +349,7 @@ class XxxModel(XxxPreTrainedModel):
|
|||||||
|
|
||||||
##################################
|
##################################
|
||||||
# Replace this with your model code
|
# Replace this with your model code
|
||||||
embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
|
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
|
||||||
encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
|
encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
|
||||||
sequence_output = encoder_outputs[0]
|
sequence_output = encoder_outputs[0]
|
||||||
outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
|
outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
|
||||||
@@ -385,14 +400,15 @@ class XxxForMaskedLM(XxxPreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.lm_head
|
return self.lm_head
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
masked_lm_labels=None):
|
masked_lm_labels=None):
|
||||||
|
|
||||||
outputs = self.transformer(input_ids,
|
outputs = self.transformer(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
prediction_scores = self.cls(sequence_output)
|
prediction_scores = self.cls(sequence_output)
|
||||||
@@ -450,14 +466,15 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None, labels=None):
|
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||||
|
|
||||||
outputs = self.transformer(input_ids,
|
outputs = self.transformer(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
@@ -521,14 +538,15 @@ class XxxForTokenClassification(XxxPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None, labels=None):
|
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||||
|
|
||||||
outputs = self.transformer(input_ids,
|
outputs = self.transformer(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
@@ -604,14 +622,15 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
start_positions=None, end_positions=None):
|
start_positions=None, end_positions=None):
|
||||||
|
|
||||||
outputs = self.transformer(input_ids,
|
outputs = self.transformer(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_tf, slow
|
||||||
|
|
||||||
from transformers import XxxConfig, is_tf_available
|
from transformers import XxxConfig, is_tf_available
|
||||||
|
|
||||||
@@ -33,10 +33,9 @@ if is_tf_available():
|
|||||||
TFXxxForTokenClassification,
|
TFXxxForTokenClassification,
|
||||||
TFXxxForQuestionAnswering,
|
TFXxxForQuestionAnswering,
|
||||||
TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_tf
|
||||||
class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
|
all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
|
||||||
@@ -244,7 +243,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in ['xxx-base-uncased']:
|
for model_name in ['xxx-base-uncased']:
|
||||||
|
|||||||
@@ -18,12 +18,12 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
|
||||||
|
|
||||||
from transformers import is_torch_available
|
from transformers import is_torch_available
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
from .configuration_common_test import ConfigTester
|
from .configuration_common_test import ConfigTester
|
||||||
|
from .utils import require_torch, slow, torch_device
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
|
from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
|
||||||
@@ -31,10 +31,9 @@ if is_torch_available():
|
|||||||
XxxForQuestionAnswering, XxxForSequenceClassification,
|
XxxForQuestionAnswering, XxxForSequenceClassification,
|
||||||
XxxForTokenClassification, XxxForMultipleChoice)
|
XxxForTokenClassification, XxxForMultipleChoice)
|
||||||
from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
|
from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
else:
|
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
|
||||||
|
|
||||||
|
|
||||||
|
@require_torch
|
||||||
class XxxModelTest(CommonTestCases.CommonModelTester):
|
class XxxModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
|
all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
|
||||||
@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = XxxModel(config=config)
|
model = XxxModel(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||||
@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = XxxForMaskedLM(config=config)
|
model = XxxForMaskedLM(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
|
|
||||||
def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = XxxForQuestionAnswering(config=config)
|
model = XxxForQuestionAnswering(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
start_positions=sequence_labels, end_positions=sequence_labels)
|
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = XxxForSequenceClassification(config)
|
model = XxxForSequenceClassification(config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = XxxForTokenClassification(config=config)
|
model = XxxForTokenClassification(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
cache_dir = "/tmp/transformers_test/"
|
cache_dir = "/tmp/transformers_test/"
|
||||||
for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
|||||||
@@ -172,7 +172,7 @@ class XxxTokenizer(PreTrainedTokenizer):
|
|||||||
special tokens for the model
|
special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if already_has_special_tokens:
|
if already_has_special_tokens:
|
||||||
|
|||||||
23
transformers-cli
Normal file
23
transformers-cli
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
from transformers.commands.user import UserCommands
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
parser = ArgumentParser(description='Transformers CLI tool', usage='transformers-cli <command> [<args>]')
|
||||||
|
commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
|
||||||
|
|
||||||
|
# Register commands
|
||||||
|
UserCommands.register_subcommand(commands_parser)
|
||||||
|
|
||||||
|
# Let's go
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if not hasattr(args, 'func'):
|
||||||
|
parser.print_help()
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
# Run
|
||||||
|
service = args.func(args)
|
||||||
|
service.run()
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
__version__ = "2.1.1"
|
__version__ = "2.2.1"
|
||||||
|
|
||||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||||
# default Python logging output behavior when present.
|
# default Python logging output behavior when present.
|
||||||
@@ -25,10 +25,13 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH
|
|||||||
from .data import (is_sklearn_available,
|
from .data import (is_sklearn_available,
|
||||||
InputExample, InputFeatures, DataProcessor,
|
InputExample, InputFeatures, DataProcessor,
|
||||||
glue_output_modes, glue_convert_examples_to_features,
|
glue_output_modes, glue_convert_examples_to_features,
|
||||||
glue_processors, glue_tasks_num_labels)
|
glue_processors, glue_tasks_num_labels,
|
||||||
|
xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
|
||||||
|
squad_convert_examples_to_features, SquadFeatures,
|
||||||
|
SquadExample, SquadV1Processor, SquadV2Processor)
|
||||||
|
|
||||||
if is_sklearn_available():
|
if is_sklearn_available():
|
||||||
from .data import glue_compute_metrics
|
from .data import glue_compute_metrics, xnli_compute_metrics
|
||||||
|
|
||||||
# Tokenizers
|
# Tokenizers
|
||||||
from .tokenization_utils import (PreTrainedTokenizer)
|
from .tokenization_utils import (PreTrainedTokenizer)
|
||||||
@@ -42,6 +45,8 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
|||||||
from .tokenization_xlm import XLMTokenizer
|
from .tokenization_xlm import XLMTokenizer
|
||||||
from .tokenization_roberta import RobertaTokenizer
|
from .tokenization_roberta import RobertaTokenizer
|
||||||
from .tokenization_distilbert import DistilBertTokenizer
|
from .tokenization_distilbert import DistilBertTokenizer
|
||||||
|
from .tokenization_albert import AlbertTokenizer
|
||||||
|
from .tokenization_camembert import CamembertTokenizer
|
||||||
from .tokenization_t5 import T5Tokenizer
|
from .tokenization_t5 import T5Tokenizer
|
||||||
|
|
||||||
# Configurations
|
# Configurations
|
||||||
@@ -56,6 +61,8 @@ from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MA
|
|||||||
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
|
from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
# Modeling
|
# Modeling
|
||||||
@@ -73,7 +80,8 @@ if is_torch_available():
|
|||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
||||||
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
||||||
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
AdaptiveEmbedding,
|
||||||
|
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
|
from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
|
||||||
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
||||||
load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
@@ -81,9 +89,10 @@ if is_torch_available():
|
|||||||
CTRLLMHeadModel,
|
CTRLLMHeadModel,
|
||||||
CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
||||||
XLNetForSequenceClassification, XLNetForMultipleChoice,
|
XLNetForSequenceClassification, XLNetForTokenClassification,
|
||||||
XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
|
XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
|
||||||
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
XLNetForQuestionAnswering, load_tf_weights_in_xlnet,
|
||||||
|
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
||||||
XLMWithLMHeadModel, XLMForSequenceClassification,
|
XLMWithLMHeadModel, XLMForSequenceClassification,
|
||||||
XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
|
XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
|
||||||
@@ -92,22 +101,31 @@ if is_torch_available():
|
|||||||
RobertaForSequenceClassification, RobertaForMultipleChoice,
|
RobertaForSequenceClassification, RobertaForMultipleChoice,
|
||||||
RobertaForTokenClassification,
|
RobertaForTokenClassification,
|
||||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
|
||||||
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
||||||
|
DistilBertForTokenClassification,
|
||||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
from .modeling_camembert import (CamembertForMaskedLM, CamembertModel,
|
||||||
|
CamembertForSequenceClassification, CamembertForMultipleChoice,
|
||||||
|
CamembertForTokenClassification,
|
||||||
|
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
||||||
from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
|
from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
|
||||||
load_tf_weights_in_t5,
|
load_tf_weights_in_t5,
|
||||||
T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
|
||||||
|
AlbertForQuestionAnswering,
|
||||||
|
load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
# Optimization
|
# Optimization
|
||||||
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
|
||||||
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
|
||||||
|
|
||||||
|
|
||||||
# TensorFlow
|
# TensorFlow
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
|
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
|
||||||
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
||||||
TFAutoModelWithLMHead)
|
TFAutoModelWithLMHead)
|
||||||
|
|
||||||
@@ -133,6 +151,7 @@ if is_tf_available():
|
|||||||
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
||||||
TFXLNetModel, TFXLNetLMHeadModel,
|
TFXLNetModel, TFXLNetLMHeadModel,
|
||||||
TFXLNetForSequenceClassification,
|
TFXLNetForSequenceClassification,
|
||||||
|
TFXLNetForTokenClassification,
|
||||||
TFXLNetForQuestionAnsweringSimple,
|
TFXLNetForQuestionAnsweringSimple,
|
||||||
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
@@ -151,6 +170,7 @@ if is_tf_available():
|
|||||||
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
|
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
|
||||||
TFDistilBertModel, TFDistilBertForMaskedLM,
|
TFDistilBertModel, TFDistilBertForMaskedLM,
|
||||||
TFDistilBertForSequenceClassification,
|
TFDistilBertForSequenceClassification,
|
||||||
|
TFDistilBertForTokenClassification,
|
||||||
TFDistilBertForQuestionAnswering,
|
TFDistilBertForQuestionAnswering,
|
||||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
@@ -158,9 +178,16 @@ if is_tf_available():
|
|||||||
TFCTRLLMHeadModel,
|
TFCTRLLMHeadModel,
|
||||||
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
|
||||||
|
TFAlbertForSequenceClassification,
|
||||||
|
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
|
from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
|
||||||
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
# Optimization
|
||||||
|
from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
|
||||||
|
|
||||||
# TF 2.0 <=> PyTorch conversion utilities
|
# TF 2.0 <=> PyTorch conversion utilities
|
||||||
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
|
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
|
||||||
load_pytorch_checkpoint_in_tf2_model,
|
load_pytorch_checkpoint_in_tf2_model,
|
||||||
|
|||||||
12
transformers/commands/__init__.py
Normal file
12
transformers/commands/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
class BaseTransformersCLICommand(ABC):
|
||||||
|
@staticmethod
|
||||||
|
@abstractmethod
|
||||||
|
def register_subcommand(parser: ArgumentParser):
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def run(self):
|
||||||
|
raise NotImplementedError()
|
||||||
165
transformers/commands/user.py
Normal file
165
transformers/commands/user.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
from argparse import ArgumentParser
|
||||||
|
from getpass import getpass
|
||||||
|
import os
|
||||||
|
|
||||||
|
from transformers.commands import BaseTransformersCLICommand
|
||||||
|
from transformers.hf_api import HfApi, HfFolder, HTTPError
|
||||||
|
|
||||||
|
|
||||||
|
class UserCommands(BaseTransformersCLICommand):
|
||||||
|
@staticmethod
|
||||||
|
def register_subcommand(parser: ArgumentParser):
|
||||||
|
login_parser = parser.add_parser('login')
|
||||||
|
login_parser.set_defaults(func=lambda args: LoginCommand(args))
|
||||||
|
whoami_parser = parser.add_parser('whoami')
|
||||||
|
whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
|
||||||
|
logout_parser = parser.add_parser('logout')
|
||||||
|
logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
|
||||||
|
list_parser = parser.add_parser('ls')
|
||||||
|
list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
|
||||||
|
# upload
|
||||||
|
upload_parser = parser.add_parser('upload')
|
||||||
|
upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
|
||||||
|
upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
|
||||||
|
upload_parser.set_defaults(func=lambda args: UploadCommand(args))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ANSI:
|
||||||
|
"""
|
||||||
|
Helper for en.wikipedia.org/wiki/ANSI_escape_code
|
||||||
|
"""
|
||||||
|
_bold = u"\u001b[1m"
|
||||||
|
_reset = u"\u001b[0m"
|
||||||
|
@classmethod
|
||||||
|
def bold(cls, s):
|
||||||
|
return "{}{}{}".format(cls._bold, s, cls._reset)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseUserCommand:
|
||||||
|
def __init__(self, args):
|
||||||
|
self.args = args
|
||||||
|
self._api = HfApi()
|
||||||
|
|
||||||
|
|
||||||
|
class LoginCommand(BaseUserCommand):
|
||||||
|
def run(self):
|
||||||
|
print("""
|
||||||
|
_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
|
||||||
|
_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||||
|
_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
|
||||||
|
_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||||
|
_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
|
||||||
|
|
||||||
|
""")
|
||||||
|
username = input("Username: ")
|
||||||
|
password = getpass()
|
||||||
|
try:
|
||||||
|
token = self._api.login(username, password)
|
||||||
|
except HTTPError as e:
|
||||||
|
# probably invalid credentials, display error message.
|
||||||
|
print(e)
|
||||||
|
exit(1)
|
||||||
|
HfFolder.save_token(token)
|
||||||
|
print("Login successful")
|
||||||
|
print("Your token:", token, "\n")
|
||||||
|
print("Your token has been saved to", HfFolder.path_token)
|
||||||
|
|
||||||
|
|
||||||
|
class WhoamiCommand(BaseUserCommand):
|
||||||
|
def run(self):
|
||||||
|
token = HfFolder.get_token()
|
||||||
|
if token is None:
|
||||||
|
print("Not logged in")
|
||||||
|
exit()
|
||||||
|
try:
|
||||||
|
user = self._api.whoami(token)
|
||||||
|
print(user)
|
||||||
|
except HTTPError as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
|
||||||
|
class LogoutCommand(BaseUserCommand):
|
||||||
|
def run(self):
|
||||||
|
token = HfFolder.get_token()
|
||||||
|
if token is None:
|
||||||
|
print("Not logged in")
|
||||||
|
exit()
|
||||||
|
HfFolder.delete_token()
|
||||||
|
self._api.logout(token)
|
||||||
|
print("Successfully logged out.")
|
||||||
|
|
||||||
|
|
||||||
|
class ListObjsCommand(BaseUserCommand):
|
||||||
|
def tabulate(self, rows, headers):
|
||||||
|
# type: (List[List[Union[str, int]]], List[str]) -> str
|
||||||
|
"""
|
||||||
|
Inspired by:
|
||||||
|
stackoverflow.com/a/8356620/593036
|
||||||
|
stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
|
||||||
|
"""
|
||||||
|
col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
|
||||||
|
row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
|
||||||
|
lines = []
|
||||||
|
lines.append(
|
||||||
|
row_format.format(*headers)
|
||||||
|
)
|
||||||
|
lines.append(
|
||||||
|
row_format.format(*["-" * w for w in col_widths])
|
||||||
|
)
|
||||||
|
for row in rows:
|
||||||
|
lines.append(
|
||||||
|
row_format.format(*row)
|
||||||
|
)
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
token = HfFolder.get_token()
|
||||||
|
if token is None:
|
||||||
|
print("Not logged in")
|
||||||
|
exit(1)
|
||||||
|
try:
|
||||||
|
objs = self._api.list_objs(token)
|
||||||
|
except HTTPError as e:
|
||||||
|
print(e)
|
||||||
|
exit(1)
|
||||||
|
if len(objs) == 0:
|
||||||
|
print("No shared file yet")
|
||||||
|
exit()
|
||||||
|
rows = [ [
|
||||||
|
obj.filename,
|
||||||
|
obj.LastModified,
|
||||||
|
obj.ETag,
|
||||||
|
obj.Size
|
||||||
|
] for obj in objs ]
|
||||||
|
print(
|
||||||
|
self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class UploadCommand(BaseUserCommand):
|
||||||
|
def run(self):
|
||||||
|
token = HfFolder.get_token()
|
||||||
|
if token is None:
|
||||||
|
print("Not logged in")
|
||||||
|
exit(1)
|
||||||
|
filepath = os.path.join(os.getcwd(), self.args.file)
|
||||||
|
filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
|
||||||
|
print(
|
||||||
|
"About to upload file {} to S3 under filename {}".format(
|
||||||
|
ANSI.bold(filepath), ANSI.bold(filename)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
choice = input("Proceed? [Y/n] ").lower()
|
||||||
|
if not(choice == "" or choice == "y" or choice == "yes"):
|
||||||
|
print("Abort")
|
||||||
|
exit()
|
||||||
|
print(
|
||||||
|
ANSI.bold("Uploading... This might take a while if file is large")
|
||||||
|
)
|
||||||
|
access_url = self._api.presign_and_upload(
|
||||||
|
token=token, filename=filename, filepath=filepath
|
||||||
|
)
|
||||||
|
print("Your file now lives at:")
|
||||||
|
print(access_url)
|
||||||
100
transformers/configuration_albert.py
Normal file
100
transformers/configuration_albert.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" ALBERT model configuration """
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
|
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
|
||||||
|
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
|
||||||
|
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
|
||||||
|
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
|
||||||
|
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
|
||||||
|
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
|
||||||
|
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
|
||||||
|
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
|
||||||
|
}
|
||||||
|
|
||||||
|
class AlbertConfig(PretrainedConfig):
|
||||||
|
"""Configuration for `AlbertModel`.
|
||||||
|
|
||||||
|
The default settings match the configuration of model `albert_xxlarge`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
vocab_size_or_config_json_file=30000,
|
||||||
|
embedding_size=128,
|
||||||
|
hidden_size=4096,
|
||||||
|
num_hidden_layers=12,
|
||||||
|
num_hidden_groups=1,
|
||||||
|
num_attention_heads=64,
|
||||||
|
intermediate_size=16384,
|
||||||
|
inner_group_num=1,
|
||||||
|
hidden_act="gelu_new",
|
||||||
|
hidden_dropout_prob=0,
|
||||||
|
attention_probs_dropout_prob=0,
|
||||||
|
max_position_embeddings=512,
|
||||||
|
type_vocab_size=2,
|
||||||
|
initializer_range=0.02,
|
||||||
|
layer_norm_eps=1e-12, **kwargs):
|
||||||
|
"""Constructs AlbertConfig.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `AlbertModel` (stored as `vocab_size`).
|
||||||
|
embedding_size: Size of the vocabulary embeddings.
|
||||||
|
hidden_size: Size of the encoder layers and the pooler layer.
|
||||||
|
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||||
|
num_hidden_groups: Number of groups for the hidden layers; parameters in
|
||||||
|
the same group are shared.
|
||||||
|
num_attention_heads: Number of attention heads for each attention layer in
|
||||||
|
the Transformer encoder.
|
||||||
|
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
||||||
|
layer in the Transformer encoder.
|
||||||
|
inner_group_num: int, number of inner repetitions of attention and ffn.
|
||||||
|
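down_scale_factor: float, the scale to apply (NOTE(review): not an explicit parameter of this constructor; could only arrive via **kwargs, confirm whether it is still used)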
|
||||||
|
hidden_act: The non-linear activation function (function or string) in the
|
||||||
|
encoder and pooler.
|
||||||
|
hidden_dropout_prob: The dropout probability for all fully connected
|
||||||
|
layers in the embeddings, encoder, and pooler.
|
||||||
|
attention_probs_dropout_prob: The dropout ratio for the attention
|
||||||
|
probabilities.
|
||||||
|
max_position_embeddings: The maximum sequence length that this model might
|
||||||
|
ever be used with. Typically set this to something large just in case
|
||||||
|
(e.g., 512 or 1024 or 2048).
|
||||||
|
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
||||||
|
`AlbertModel`.
|
||||||
|
initializer_range: The stdev of the truncated_normal_initializer for
|
||||||
|
initializing all weight matrices.
|
||||||
|
"""
|
||||||
|
super(AlbertConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
|
self.vocab_size = vocab_size_or_config_json_file
|
||||||
|
self.embedding_size = embedding_size
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_hidden_groups = num_hidden_groups
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.inner_group_num = inner_group_num
|
||||||
|
self.hidden_act = hidden_act
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.hidden_dropout_prob = hidden_dropout_prob
|
||||||
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
self.type_vocab_size = type_vocab_size
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.layer_norm_eps = layer_norm_eps
|
||||||
@@ -27,6 +27,8 @@ from .configuration_xlm import XLMConfig
|
|||||||
from .configuration_roberta import RobertaConfig
|
from .configuration_roberta import RobertaConfig
|
||||||
from .configuration_distilbert import DistilBertConfig
|
from .configuration_distilbert import DistilBertConfig
|
||||||
from .configuration_ctrl import CTRLConfig
|
from .configuration_ctrl import CTRLConfig
|
||||||
|
from .configuration_camembert import CamembertConfig
|
||||||
|
from .configuration_albert import AlbertConfig
|
||||||
from .configuration_t5 import T5Config
|
from .configuration_t5 import T5Config
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -44,13 +46,15 @@ class AutoConfig(object):
|
|||||||
The base model class to instantiate is selected as the first pattern matching
|
The base model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertConfig (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||||
|
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||||
- contains `bert`: BertConfig (Bert model)
|
- contains `bert`: BertConfig (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||||
- contains `xlm`: XLMConfig (XLM model)
|
- contains `xlm`: XLMConfig (XLM model)
|
||||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
|
||||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
- contains `ctrl` : CTRLConfig (CTRL model)
|
||||||
This class cannot be instantiated using `__init__()` (throw an error).
|
This class cannot be instantiated using `__init__()` (throw an error).
|
||||||
"""
|
"""
|
||||||
@@ -67,13 +71,15 @@ class AutoConfig(object):
|
|||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `t5`: T5Config (T5 model)
|
- contains `t5`: T5Config (T5 model)
|
||||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertConfig (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||||
|
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||||
- contains `bert`: BertConfig (Bert model)
|
- contains `bert`: BertConfig (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||||
- contains `xlm`: XLMConfig (XLM model)
|
- contains `xlm`: XLMConfig (XLM model)
|
||||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
|
||||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
- contains `ctrl` : CTRLConfig (CTRL model)
|
||||||
Params:
|
Params:
|
||||||
pretrained_model_name_or_path: either:
|
pretrained_model_name_or_path: either:
|
||||||
@@ -94,6 +100,9 @@ class AutoConfig(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -120,6 +129,10 @@ class AutoConfig(object):
|
|||||||
return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
elif 'distilbert' in pretrained_model_name_or_path:
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
|
elif 'camembert' in pretrained_model_name_or_path:
|
||||||
|
return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -138,4 +151,4 @@ class AutoConfig(object):
|
|||||||
return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||||
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
|
"'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|||||||
33
transformers/configuration_camembert.py
Normal file
33
transformers/configuration_camembert.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" CamemBERT configuration """
|
||||||
|
|
||||||
|
from __future__ import (absolute_import, division, print_function,
|
||||||
|
unicode_literals)
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from .configuration_roberta import RobertaConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
|
'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class CamembertConfig(RobertaConfig):
    # CamemBERT configuration: a RobertaConfig subclass whose only override is
    # the archive map below, which maps pretrained shortcut names (currently
    # just 'camembert-base') to the URL of their JSON configuration file.
    pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
@@ -27,7 +27,9 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
||||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
|
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
|
||||||
|
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
|
||||||
|
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ logger = logging.getLogger(__name__)
|
|||||||
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
|
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
|
||||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
|
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
|
||||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
|
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
|
||||||
|
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
|
||||||
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
|
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
|
||||||
|
|
||||||
class GPT2Config(PretrainedConfig):
|
class GPT2Config(PretrainedConfig):
|
||||||
|
|||||||
@@ -29,6 +29,8 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|||||||
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
|
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
|
||||||
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
|
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
|
||||||
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
|
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
|
||||||
|
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
|
||||||
|
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -94,6 +94,9 @@ class PretrainedConfig(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -120,6 +123,7 @@ class PretrainedConfig(object):
|
|||||||
"""
|
"""
|
||||||
cache_dir = kwargs.pop('cache_dir', None)
|
cache_dir = kwargs.pop('cache_dir', None)
|
||||||
force_download = kwargs.pop('force_download', False)
|
force_download = kwargs.pop('force_download', False)
|
||||||
|
resume_download = kwargs.pop('resume_download', False)
|
||||||
proxies = kwargs.pop('proxies', None)
|
proxies = kwargs.pop('proxies', None)
|
||||||
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
|
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
|
||||||
|
|
||||||
@@ -131,7 +135,8 @@ class PretrainedConfig(object):
|
|||||||
config_file = pretrained_model_name_or_path
|
config_file = pretrained_model_name_or_path
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
|
||||||
|
proxies=proxies, resume_download=resume_download)
|
||||||
except EnvironmentError:
|
except EnvironmentError:
|
||||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
||||||
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
||||||
|
|||||||
@@ -0,0 +1,67 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Convert ALBERT checkpoint."""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
|
||||||
|
|
||||||
|
import logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    """Convert a TensorFlow ALBERT checkpoint into a saved PyTorch state dict.

    Args:
        tf_checkpoint_path: location of the TensorFlow checkpoint to read.
        albert_config_file: JSON file describing the ALBERT architecture.
        pytorch_dump_path: destination handed to ``torch.save``.
    """
    # Build an untrained PyTorch model with the requested architecture.
    albert_config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(albert_config)))
    pytorch_model = AlbertForMaskedLM(albert_config)

    # Populate the model from the TensorFlow checkpoint.
    load_tf_weights_in_albert(pytorch_model, albert_config, tf_checkpoint_path)

    # Persist only the weights (state dict), not the whole module object.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = pytorch_model.state_dict()
    torch.save(weights, pytorch_dump_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: three mandatory string options, declared from
    # a (flag, help text) table so they share identical settings.
    parser = argparse.ArgumentParser()
    required_options = (
        ("--tf_checkpoint_path",
         "Path to the TensorFlow checkpoint path."),
        ("--albert_config_file",
         "The config json file corresponding to the pre-trained ALBERT model. \n"
         "This specifies the model architecture."),
        ("--pytorch_dump_path",
         "Path to the output PyTorch model."),
    )
    for option_flag, option_help in required_options:
        parser.add_argument(option_flag, default=None, type=str, required=True, help=option_help)

    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        args.tf_checkpoint_path,
        args.albert_config_file,
        args.pytorch_dump_path,
    )
|
||||||
|
|
||||||
@@ -34,6 +34,7 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
|
|||||||
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
@@ -48,6 +49,7 @@ if is_torch_available():
|
|||||||
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
else:
|
else:
|
||||||
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
@@ -59,6 +61,7 @@ else:
|
|||||||
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
|
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
|
T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
|
||||||
None, None, None, None,
|
None, None, None, None,
|
||||||
None, None,
|
None, None,
|
||||||
@@ -69,6 +72,7 @@ else:
|
|||||||
None, None, None,
|
None, None, None,
|
||||||
None, None, None,
|
None, None, None,
|
||||||
None, None,
|
None, None,
|
||||||
|
None, None,
|
||||||
None, None)
|
None, None)
|
||||||
|
|
||||||
|
|
||||||
@@ -90,6 +94,7 @@ MODEL_CLASSES = {
|
|||||||
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
|
'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
from .processors import InputExample, InputFeatures, DataProcessor
|
from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
|
||||||
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||||
|
from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
|
||||||
|
from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||||
|
|
||||||
from .metrics import is_sklearn_available
|
from .metrics import is_sklearn_available
|
||||||
if is_sklearn_available():
|
if is_sklearn_available():
|
||||||
from .metrics import glue_compute_metrics
|
from .metrics import glue_compute_metrics, xnli_compute_metrics
|
||||||
|
|||||||
@@ -81,3 +81,11 @@ if _has_sklearn:
|
|||||||
return {"acc": simple_accuracy(preds, labels)}
|
return {"acc": simple_accuracy(preds, labels)}
|
||||||
else:
|
else:
|
||||||
raise KeyError(task_name)
|
raise KeyError(task_name)
|
||||||
|
|
||||||
|
|
||||||
|
def xnli_compute_metrics(task_name, preds, labels):
    """Return ``{"acc": ...}`` for the ``"xnli"`` task.

    Asserts that ``preds`` and ``labels`` have the same length and raises
    ``KeyError(task_name)`` for any other task name.
    """
    assert len(preds) == len(labels)
    if task_name != "xnli":
        raise KeyError(task_name)
    return {"acc": simple_accuracy(preds, labels)}
|
||||||
|
|||||||
758
transformers/data/metrics/squad_metrics.py
Normal file
758
transformers/data/metrics/squad_metrics.py
Normal file
@@ -0,0 +1,758 @@
|
|||||||
|
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
|
||||||
|
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
|
||||||
|
|
||||||
|
In addition to basic functionality, we also compute additional statistics and
|
||||||
|
plot precision-recall curves if an additional na_prob.json file is provided.
|
||||||
|
This file is expected to map question ID's to the model's predicted probability
|
||||||
|
that a question is unanswerable.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import collections
|
||||||
|
from io import open
|
||||||
|
from tqdm import tqdm
|
||||||
|
import string
|
||||||
|
import re
|
||||||
|
|
||||||
|
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_answer(s):
    """Lower-case ``s``, then drop punctuation, the articles a/an/the, and surplus whitespace.

    The steps run in that order, so punctuation is already gone when the
    article pattern is applied.
    """
    punctuation_chars = set(string.punctuation)
    article_pattern = re.compile(r'\b(a|an|the)\b', re.UNICODE)

    lowered = s.lower()
    without_punctuation = ''.join(char for char in lowered if char not in punctuation_chars)
    without_articles = article_pattern.sub(' ', without_punctuation)
    # split()/join collapses every whitespace run into a single space and trims both ends.
    return ' '.join(without_articles.split())
|
||||||
|
|
||||||
|
|
||||||
|
def get_tokens(s):
    """Return the whitespace-separated tokens of the normalized answer; ``[]`` for a falsy ``s``."""
    return normalize_answer(s).split() if s else []
|
||||||
|
|
||||||
|
|
||||||
|
def compute_exact(a_gold, a_pred):
    """Return 1 when gold and predicted answers are equal after normalization, else 0."""
    normalized_gold = normalize_answer(a_gold)
    normalized_pred = normalize_answer(a_pred)
    return 1 if normalized_gold == normalized_pred else 0
|
||||||
|
|
||||||
|
|
||||||
|
def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Returns an int (0 or 1) when either side has no tokens or when nothing
    overlaps, otherwise the float harmonic mean of precision and recall.
    """
    gold_tokens = get_tokens(a_gold)
    predicted_tokens = get_tokens(a_pred)

    if not gold_tokens or not predicted_tokens:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_tokens == predicted_tokens)

    # Multiset intersection: each shared token counts min(gold count, pred count) times.
    overlap = collections.Counter(gold_tokens) & collections.Counter(predicted_tokens)
    shared_count = sum(overlap.values())
    if shared_count == 0:
        return 0

    precision = 1.0 * shared_count / len(predicted_tokens)
    recall = 1.0 * shared_count / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)
|
||||||
|
|
||||||
|
|
||||||
|
def get_raw_scores(examples, preds):
    """
    Score each example's prediction against its gold answers.

    Returns two dicts keyed by ``qas_id``: the best exact-match score and the
    best F1 score over the example's gold answers. Examples whose ``qas_id``
    is absent from ``preds`` are reported on stdout and left out of both dicts.
    """
    exact_scores, f1_scores = {}, {}

    for example in examples:
        qas_id = example.qas_id

        # Keep only gold answers that are non-empty after normalization.
        references = []
        for answer in example.answers:
            if normalize_answer(answer['text']):
                references.append(answer['text'])
        # For unanswerable questions, only correct answer is empty string
        references = references or ['']

        if qas_id in preds:
            predicted = preds[qas_id]
            exact_scores[qas_id] = max(compute_exact(reference, predicted) for reference in references)
            f1_scores[qas_id] = max(compute_f1(reference, predicted) for reference in references)
        else:
            print('Missing prediction for %s' % qas_id)

    return exact_scores, f1_scores
|
||||||
|
|
||||||
|
|
||||||
|
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    """Override scores of questions whose no-answer probability exceeds the threshold.

    For such questions the score becomes 1.0 if the question truly has no
    answer and 0.0 otherwise; all other questions keep their original score.
    Returns a new dict; ``scores`` is not modified.
    """
    return {
        qid: (float(not qid_to_has_ans[qid]) if na_probs[qid] > na_prob_thresh else score)
        for qid, score in scores.items()
    }
|
||||||
|
|
||||||
|
|
||||||
|
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Average exact-match and F1 scores into percentages.

    With a non-empty ``qid_list`` only those question ids are averaged;
    otherwise every entry of the score dicts is used. Returns an OrderedDict
    with the keys 'exact', 'f1' and 'total', in that order.
    """
    if qid_list:
        total = len(qid_list)
        exact_sum = sum(exact_scores[qid] for qid in qid_list)
        f1_sum = sum(f1_scores[qid] for qid in qid_list)
    else:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())

    summary = collections.OrderedDict()
    summary['exact'] = 100.0 * exact_sum / total
    summary['f1'] = 100.0 * f1_sum / total
    summary['total'] = total
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
def merge_eval(main_eval, new_eval, prefix):
    """Copy every entry of ``new_eval`` into ``main_eval`` under the key ``'<prefix>_<key>'``."""
    for metric_name in new_eval:
        prefixed_name = '%s_%s' % (prefix, metric_name)
        main_eval[prefixed_name] = new_eval[metric_name]
|
||||||
|
|
||||||
|
|
||||||
|
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
    """Search for the no-answer probability threshold that maximizes the total score.

    The running score starts at the number of questions without an answer.
    Question ids are then visited in ascending ``na_probs`` order: an
    answerable question adds its score, while an unanswerable question with a
    non-empty prediction subtracts 1. The best running score and the
    ``na_probs`` value at which it was reached are tracked.

    Returns:
        A tuple ``(best_score_pct, best_thresh, has_ans_avg)`` where
        ``best_score_pct`` is ``100 * best_score / len(scores)`` and
        ``has_ans_avg`` is the summed score of answerable questions divided by
        the number of answerable questions found in ``na_probs``. When
        ``na_probs`` contains no answerable question, ``has_ans_avg`` is 0.0
        instead of raising ZeroDivisionError.
    """
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])

    for qid in qid_list:
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        elif preds[qid]:
            # A non-empty prediction for an unanswerable question costs a point.
            diff = -1
        else:
            diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]

    has_ans_score, has_ans_cnt = 0, 0
    for qid in qid_list:
        if not qid_to_has_ans[qid]:
            continue
        # Answerable questions count towards the denominator even when unscored.
        has_ans_cnt += 1
        if qid not in scores:
            continue
        has_ans_score += scores[qid]

    # Guard: a dataset slice with only unanswerable questions has nothing to average.
    has_ans_avg = 1.0 * has_ans_score / has_ans_cnt if has_ans_cnt else 0.0
    return 100.0 * best_score / len(scores), best_thresh, has_ans_avg
|
||||||
|
|
||||||
|
|
||||||
|
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Run ``find_best_thresh_v2`` on the exact and F1 scores and record the results in ``main_eval``.

    Both searches finish before ``main_eval`` is touched. The keys are written
    in the order best_exact, best_exact_thresh, best_f1, best_f1_thresh,
    has_ans_exact, has_ans_f1.
    """
    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
        preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
        preds, f1_raw, na_probs, qid_to_has_ans)

    summary_entries = (
        ('best_exact', best_exact),
        ('best_exact_thresh', exact_thresh),
        ('best_f1', best_f1),
        ('best_f1_thresh', f1_thresh),
        ('has_ans_exact', has_ans_exact),
        ('has_ans_f1', has_ans_f1),
    )
    for entry_name, entry_value in summary_entries:
        main_eval[entry_name] = entry_value
|
||||||
|
|
||||||
|
|
||||||
|
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    """Search for the no-answer probability threshold that maximizes the total score.

    The running score starts at the number of questions without an answer.
    Scored question ids are visited in ascending ``na_probs`` order: an
    answerable question adds its score, and an unanswerable question with a
    non-empty prediction subtracts 1.

    Returns:
        ``(100 * best_score / len(scores), best_thresh)`` where ``best_thresh``
        is the ``na_probs`` value at which the best running score was reached
        (0.0 if the starting score was never beaten).
    """
    unanswerable_total = sum(1 for qid in qid_to_has_ans if not qid_to_has_ans[qid])
    running_score = unanswerable_total
    top_score = running_score
    top_thresh = 0.0

    for qid in sorted(na_probs, key=lambda k: na_probs[k]):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            delta = scores[qid]
        else:
            delta = -1 if preds[qid] else 0
        running_score += delta
        if running_score > top_score:
            top_score = running_score
            top_thresh = na_probs[qid]

    return 100.0 * top_score / len(scores), top_thresh
|
||||||
|
|
||||||
|
|
||||||
|
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """Run ``find_best_thresh`` on the exact and F1 scores and store both results in ``main_eval``.

    Both searches finish before ``main_eval`` is written; the keys are
    best_exact, best_exact_thresh, best_f1 and best_f1_thresh.
    """
    exact_result = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    f1_result = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)

    # Each result is a (best score, threshold) pair.
    best_exact, exact_thresh = exact_result
    best_f1, f1_thresh = f1_result

    main_eval['best_exact'] = best_exact
    main_eval['best_exact_thresh'] = exact_thresh
    main_eval['best_f1'] = best_f1
    main_eval['best_f1_thresh'] = f1_thresh
|
||||||
|
|
||||||
|
|
||||||
|
def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
    """Compute SQuAD-style exact/F1 metrics for ``preds`` against ``examples``.

    Args:
        examples: objects exposing ``qas_id`` and ``answers``.
        preds: mapping from ``qas_id`` to predicted answer text.
        no_answer_probs: optional mapping from ``qas_id`` to the probability
            that the question is unanswerable; defaults to 0.0 for every
            predicted id.
        no_answer_probability_threshold: probability above which a question is
            treated as predicted-unanswerable.

    Returns:
        The evaluation dict built by ``make_eval_dict``, extended with
        'HasAns_*' / 'NoAns_*' entries when such questions exist and with the
        best-threshold entries when ``no_answer_probs`` is non-empty.
    """
    qas_id_to_has_answer = {}
    for example in examples:
        qas_id_to_has_answer[example.qas_id] = bool(example.answers)

    # Partition the question ids by whether a gold answer exists.
    has_answer_qids, no_answer_qids = [], []
    for qas_id, has_answer in qas_id_to_has_answer.items():
        (has_answer_qids if has_answer else no_answer_qids).append(qas_id)

    if no_answer_probs is None:
        no_answer_probs = dict.fromkeys(preds, 0.0)

    exact, f1 = get_raw_scores(examples, preds)

    exact_threshold, f1_threshold = (
        apply_no_ans_threshold(raw, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
        for raw in (exact, f1)
    )

    evaluation = make_eval_dict(exact_threshold, f1_threshold)

    for prefix, qids in (('HasAns', has_answer_qids), ('NoAns', no_answer_qids)):
        if qids:
            subset_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=qids)
            merge_eval(evaluation, subset_eval, prefix)

    if no_answer_probs:
        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)

    return evaluation
|
||||||
|
|
||||||
|
|
||||||
|
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text.

    ``orig_text`` is the span of the original text that corresponds to the
    predicted span, but it may carry extra characters. For example, with
    pred_text = "steve smith" and orig_text = "Steve Smith's", the wanted
    result is "Steve Smith": ``pred_text`` has been normalized by the
    tokenizer and ``orig_text`` still has the trailing "'s".

    The function aligns the two strings character by character after removing
    spaces. Whenever that heuristic fails, ``orig_text`` is returned unchanged
    (and the reason is logged when ``verbose_logging`` is true).
    """

    def _strip_spaces(text):
        # Returns `text` without " " characters, plus a map from each index in
        # the stripped text to the index of the same character in `text`.
        kept = [(position, char) for position, char in enumerate(text) if char != " "]
        ns_to_s_map = collections.OrderedDict(
            (ns_index, position) for ns_index, (position, _) in enumerate(kept))
        ns_text = "".join(char for _, char in kept)
        return (ns_text, ns_to_s_map)

    # Tokenize `orig_text` and locate the prediction inside the re-joined tokens.
    basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(basic_tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    orig_ns_text, orig_ns_to_s_map = _strip_spaces(orig_text)
    tok_ns_text, tok_ns_to_s_map = _strip_spaces(tok_text)

    # Equal space-free lengths are taken to mean the characters align one-to-one.
    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # Invert the tokenized map: index in `tok_text` -> index in the space-free text.
    tok_s_to_ns_map = {tok_index: ns_index for ns_index, tok_index in tok_ns_to_s_map.items()}

    # Project the start, then the end, of the prediction onto `orig_text`.
    boundaries = (
        (start_position, "Couldn't map start position"),
        (end_position, "Couldn't map end position"),
    )
    projected = []
    for tok_position, failure_message in boundaries:
        orig_position = None
        if tok_position in tok_s_to_ns_map:
            ns_position = tok_s_to_ns_map[tok_position]
            if ns_position in orig_ns_to_s_map:
                orig_position = orig_ns_to_s_map[ns_position]
        if orig_position is None:
            if verbose_logging:
                logger.info(failure_message)
            return orig_text
        projected.append(orig_position)

    orig_start_position, orig_end_position = projected
    return orig_text[orig_start_position:(orig_end_position + 1)]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_best_indexes(logits, n_best_size):
|
||||||
|
"""Get the n-best logits from a list."""
|
||||||
|
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
|
||||||
|
|
||||||
|
best_indexes = []
|
||||||
|
for i in range(len(index_and_score)):
|
||||||
|
if i >= n_best_size:
|
||||||
|
break
|
||||||
|
best_indexes.append(index_and_score[i][0])
|
||||||
|
return best_indexes
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_softmax(scores):
|
||||||
|
"""Compute softmax probability over raw logits."""
|
||||||
|
if not scores:
|
||||||
|
return []
|
||||||
|
|
||||||
|
max_score = None
|
||||||
|
for score in scores:
|
||||||
|
if max_score is None or score > max_score:
|
||||||
|
max_score = score
|
||||||
|
|
||||||
|
exp_scores = []
|
||||||
|
total_sum = 0.0
|
||||||
|
for score in scores:
|
||||||
|
x = math.exp(score - max_score)
|
||||||
|
exp_scores.append(x)
|
||||||
|
total_sum += x
|
||||||
|
|
||||||
|
probs = []
|
||||||
|
for score in exp_scores:
|
||||||
|
probs.append(score / total_sum)
|
||||||
|
return probs
|
||||||
|
|
||||||
|
|
||||||
|
def compute_predictions_logits(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    do_lower_case,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    verbose_logging,
    version_2_with_negative,
    null_score_diff_threshold
):
    """Turn per-feature start/end logits into answer texts and write them as JSON.

    For every example, candidate (start, end) token pairs are collected from
    all of its features, ranked by ``start_logit + end_logit``, converted back
    to text with ``get_final_text`` and de-duplicated into an n-best list.

    Args:
        all_examples: examples, indexed by position; each provides ``qas_id``
            and ``doc_tokens``.
        all_features: features providing ``example_index``, ``unique_id``,
            ``tokens``, ``token_to_orig_map`` and ``token_is_max_context``.
        all_results: model outputs providing ``unique_id``, ``start_logits``
            and ``end_logits``.
        n_best_size: number of top start / end indexes considered per feature,
            and the cap on n-best entries built from the ranked candidates.
        max_answer_length: maximum answer span length, in tokens.
        do_lower_case: forwarded to ``get_final_text``.
        output_prediction_file: path of the JSON file with the best answers.
        output_nbest_file: path of the JSON file with the n-best lists.
        output_null_log_odds_file: path of the JSON file with the null score
            differences; only written when ``version_2_with_negative`` is set.
        verbose_logging: forwarded to ``get_final_text``.
        version_2_with_negative: whether an empty ("no answer") prediction is
            allowed.
        null_score_diff_threshold: the empty answer is predicted when the null
            score minus the best non-null score exceeds this value.

    Returns:
        An ``OrderedDict`` mapping ``qas_id`` to the predicted answer text.
    """
    logger.info("Writing predictions to: %s" % (output_prediction_file))
    logger.info("Writing nbest to: %s" % (output_nbest_file))

    # Group features by the example they were cut from.
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    # Look up the model output that belongs to each feature.
    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if version_2_with_negative:
                # The null score of a feature is read from position 0 of both logit lists.
                feature_null_score = result.start_logits[0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    # Only positions present in token_to_orig_map can be mapped back
                    # to a document token.
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    # Skip starts for which this feature is not flagged as max context.
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))
        if version_2_with_negative:
            # Add one null candidate per example, taken from the feature with
            # the lowest null score.
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        # Best candidates first.
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit"])

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
                # Different token spans can map to the same text; keep the first
                # (highest-scoring) occurrence only.
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))
        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="",
                        start_logit=null_start_logit,
                        end_logit=null_end_logit))

            # In very rare edge cases we could only have single null prediction.
            # So we just create a nonce prediction in this case to avoid failure.
            # NOTE(review): this check is assumed to sit inside the
            # version_2_with_negative branch, as the comment above implies - confirm.
            if len(nbest) == 1:
                nbest.insert(0,
                             _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            # Remember the first entry with non-empty text.
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    # The null log-odds file is only produced when empty answers are allowed.
    if version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions
|
||||||
|
|
||||||
|
|
||||||
|
def compute_predictions_log_probs(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    start_n_top,
    end_n_top,
    version_2_with_negative,
    tokenizer,
    verbose_logging
):
    """ XLNet write prediction logic (more complex than Bert's).
        Write final predictions to the json file and log-odds of null if needed.

        Requires utils_squad_evaluate.py

        Candidates are read from the top-k lists in each result:
        ``start_top_index[i]`` pairs with ``end_top_index[i * end_n_top + j]``,
        and candidates are ranked by ``start_log_prob + end_log_prob``.

        Args:
            all_examples: examples, indexed by position; each provides
                ``qas_id`` and ``doc_tokens``.
            all_features: features providing ``example_index``, ``unique_id``,
                ``tokens``, ``token_to_orig_map``, ``token_is_max_context``
                and ``paragraph_len``.
            all_results: model outputs providing ``unique_id``,
                ``start_logits``, ``start_top_index``, ``end_logits``,
                ``end_top_index`` and ``cls_logits``.
            n_best_size: cap on n-best entries built from the ranked candidates.
            max_answer_length: maximum answer span length, in tokens.
            output_prediction_file: path of the JSON file with the best answers.
            output_nbest_file: path of the JSON file with the n-best lists.
            output_null_log_odds_file: path of the JSON file with the null
                scores; only written when ``version_2_with_negative`` is set.
            start_n_top: number of start candidates read per result.
            end_n_top: number of end candidates read per start candidate.
            version_2_with_negative: controls whether the null score file is
                written.
            tokenizer: provides ``convert_tokens_to_string`` and
                ``do_lower_case``.
            verbose_logging: forwarded to ``get_final_text``.

        Returns:
            An ``OrderedDict`` mapping ``qas_id`` to the predicted answer text.
    """
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index",
         "start_log_prob", "end_log_prob"])

    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])

    logger.info("Writing predictions to: %s", output_prediction_file)
    # logger.info("Writing nbest to: %s" % (output_nbest_file))

    # Group features by the example they were cut from.
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    # Look up the model output that belongs to each feature.
    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive

        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]

            cur_null_score = result.cls_logits

            # if we could have irrelevant answers, get the min score of irrelevant
            score_null = min(score_null, cur_null_score)

            for i in range(start_n_top):
                for j in range(end_n_top):
                    start_log_prob = result.start_logits[i]
                    start_index = result.start_top_index[i]

                    # End candidates are stored flat: end_n_top entries per start candidate.
                    j_index = i * end_n_top + j

                    end_log_prob = result.end_logits[j_index]
                    end_index = result.end_top_index[j_index]

                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= feature.paragraph_len - 1:
                        continue
                    if end_index >= feature.paragraph_len - 1:
                        continue

                    # Skip starts for which this feature is not flagged as max context.
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue

                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_log_prob=start_log_prob,
                            end_log_prob=end_log_prob))

        # Best candidates first.
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_log_prob + x.end_log_prob),
            reverse=True)

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]

            # XLNet un-tokenizer
            # Let's keep it simple for now and see if we need all this later.
            #
            # tok_start_to_orig_index = feature.tok_start_to_orig_index
            # tok_end_to_orig_index = feature.tok_end_to_orig_index
            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
            # paragraph_text = example.paragraph_text
            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()

            # Previously used Bert untokenizer
            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)

            # Clean whitespace
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)

            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
                                        verbose_logging)

            # Different token spans can map to the same text; keep the first
            # (highest-scoring) occurrence only.
            if final_text in seen_predictions:
                continue

            seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_log_prob=pred.start_log_prob,
                    end_log_prob=pred.end_log_prob))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="", start_log_prob=-1e6,
                                 end_log_prob=-1e6))

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_log_prob + entry.end_log_prob)
            # The first (highest-ranked) entry is kept, whatever its text.
            if not best_non_null_entry:
                best_non_null_entry = entry

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_log_prob"] = entry.start_log_prob
            output["end_log_prob"] = entry.end_log_prob
            nbest_json.append(output)

        assert len(nbest_json) >= 1
        assert best_non_null_entry is not None

        # The recorded value is the minimum cls_logits over the example's features.
        score_diff = score_null
        scores_diff_json[example.qas_id] = score_diff
        # note(zhiliny): always predict best_non_null_entry
        # and the evaluation script will search for the best threshold
        all_predictions[example.qas_id] = best_non_null_entry.text

        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    # The null log-odds file is only produced when empty answers are allowed.
    if version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
from .utils import InputExample, InputFeatures, DataProcessor
|
from .utils import InputExample, InputFeatures, DataProcessor
|
||||||
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||||
|
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
|
||||||
|
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||||
585
transformers/data/processors/squad.py
Normal file
585
transformers/data/processors/squad.py
Normal file
@@ -0,0 +1,585 @@
|
|||||||
|
from tqdm import tqdm
|
||||||
|
import collections
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||||
|
from .utils import DataProcessor, InputExample, InputFeatures
|
||||||
|
from ...file_utils import is_tf_available, is_torch_available
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import TensorDataset
|
||||||
|
|
||||||
|
if is_tf_available():
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
|
||||||
|
orig_answer_text):
|
||||||
|
"""Returns tokenized answer spans that better match the annotated answer."""
|
||||||
|
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
|
||||||
|
|
||||||
|
for new_start in range(input_start, input_end + 1):
|
||||||
|
for new_end in range(input_end, new_start - 1, -1):
|
||||||
|
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
|
||||||
|
if text_span == tok_answer_text:
|
||||||
|
return (new_start, new_end)
|
||||||
|
|
||||||
|
return (input_start, input_end)
|
||||||
|
|
||||||
|
def _check_is_max_context(doc_spans, cur_span_index, position):
|
||||||
|
"""Check if this is the 'max context' doc span for the token."""
|
||||||
|
best_score = None
|
||||||
|
best_span_index = None
|
||||||
|
for (span_index, doc_span) in enumerate(doc_spans):
|
||||||
|
end = doc_span.start + doc_span.length - 1
|
||||||
|
if position < doc_span.start:
|
||||||
|
continue
|
||||||
|
if position > end:
|
||||||
|
continue
|
||||||
|
num_left_context = position - doc_span.start
|
||||||
|
num_right_context = end - position
|
||||||
|
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
|
||||||
|
if best_score is None or score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_span_index = span_index
|
||||||
|
|
||||||
|
return cur_span_index == best_span_index
|
||||||
|
|
||||||
|
def _new_check_is_max_context(doc_spans, cur_span_index, position):
|
||||||
|
"""Check if this is the 'max context' doc span for the token."""
|
||||||
|
# if len(doc_spans) == 1:
|
||||||
|
# return True
|
||||||
|
best_score = None
|
||||||
|
best_span_index = None
|
||||||
|
for (span_index, doc_span) in enumerate(doc_spans):
|
||||||
|
end = doc_span["start"] + doc_span["length"] - 1
|
||||||
|
if position < doc_span["start"]:
|
||||||
|
continue
|
||||||
|
if position > end:
|
||||||
|
continue
|
||||||
|
num_left_context = position - doc_span["start"]
|
||||||
|
num_right_context = end - position
|
||||||
|
score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
|
||||||
|
if best_score is None or score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_span_index = span_index
|
||||||
|
|
||||||
|
return cur_span_index == best_span_index
|
||||||
|
|
||||||
|
def _is_whitespace(c):
|
||||||
|
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                       doc_stride, max_query_length, is_training,
                                       return_dataset=False):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.

    Each example's context is sub-tokenized and cut into one or more spans
    (windows); one feature is produced per span.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': a ``torch.utils.data.TensorDataset`` is built and returned
            together with the features.
            NOTE(review): only 'pt' is handled in this function body; any other
            value (including 'tf') returns the feature list alone.

    Returns:
        list of :class:`~transformers.data.processors.squad.SquadFeatures`, or a
        ``(features, dataset)`` tuple when ``return_dataset == 'pt'``.

    Raises:
        ImportError: if ``return_dataset == 'pt'`` and PyTorch is not available.

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """

    # Running id given to each produced feature; incremented once per span.
    unique_id = 1000000000

    features = []
    for (example_index, example) in enumerate(tqdm(examples)):
        if is_training and not example.is_impossible:
            # Get start and end position
            start_position = example.start_position
            end_position = example.end_position

            # If the answer cannot be found in the text, then skip this example.
            actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                continue

        # Sub-tokenize every document token and keep index maps in both directions:
        # tok_to_orig_index[k] is the doc-token index of sub-token k, and
        # orig_to_tok_index[i] is the index of the first sub-token of doc token i.
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        if is_training and not example.is_impossible:
            # Translate the answer boundaries from doc-token to sub-token indexes.
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                # Last sub-token before the first sub-token of the next doc token.
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1

            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
            )

        spans = []

        truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
        # presumably the number of special tokens the tokenizer adds to a single
        # sequence and to a sequence pair, respectively - verify against the tokenizer
        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

        # Encode one window per iteration; the tokens that did not fit are
        # fed back in as the document part of the next window.
        span_doc_tokens = all_doc_tokens
        while len(spans) * doc_stride < len(all_doc_tokens):

            # The query goes first when padding on the right, second otherwise,
            # and only the document side is truncated.
            encoded_dict = tokenizer.encode_plus(
                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
                max_length=max_seq_length,
                return_overflowing_tokens=True,
                pad_to_max_length=True,
                stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
                truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
            )

            # Number of document sub-tokens in this window: what is left of the
            # document, capped by the room remaining after query and added tokens.
            paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)

            # Drop everything from the first pad token onwards before converting ids to tokens.
            if tokenizer.pad_token_id in encoded_dict['input_ids']:
                non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
            else:
                non_padded_ids = encoded_dict['input_ids']

            tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

            # Map each document position inside the window to its doc-token index.
            token_to_orig_map = {}
            for i in range(paragraph_len):
                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
                token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

            # Stash the per-window bookkeeping on the encoding dict itself.
            encoded_dict["paragraph_len"] = paragraph_len
            encoded_dict["tokens"] = tokens
            encoded_dict["token_to_orig_map"] = token_to_orig_map
            encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
            encoded_dict["token_is_max_context"] = {}
            encoded_dict["start"] = len(spans) * doc_stride
            encoded_dict["length"] = paragraph_len

            spans.append(encoded_dict)

            # No overflow means the whole document has been consumed.
            if "overflowing_tokens" not in encoded_dict:
                break
            span_doc_tokens = encoded_dict["overflowing_tokens"]

        # Flag, for every document position of every window, whether that window
        # is the "max context" one for the position.
        for doc_span_index in range(len(spans)):
            for j in range(spans[doc_span_index]["paragraph_len"]):
                is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
                index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
                spans[doc_span_index]["token_is_max_context"][index] = is_max_context

        for span in spans:
            # Identify the position of the CLS token
            cls_index = span['input_ids'].index(tokenizer.cls_token_id)

            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
            # Original TF implem also keep the classification token (set to 0) (not sure why...)
            p_mask = np.array(span['token_type_ids'])

            # Limit positive values to one
            p_mask = np.minimum(p_mask, 1)

            if tokenizer.padding_side == "right":
                # Invert the mask when padding on the right.
                p_mask = 1 - p_mask

            # Separator tokens can never be part of an answer.
            p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1

            # Set the CLS index to '0'
            p_mask[cls_index] = 0

            span_is_impossible = example.is_impossible
            start_position = 0
            end_position = 0
            if is_training and not span_is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = span["start"]
                doc_end = span["start"] + span["length"] - 1
                out_of_span = False

                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                    out_of_span = True

                if out_of_span:
                    # The answer is not fully inside this window: point both
                    # labels at the CLS token.
                    start_position = cls_index
                    end_position = cls_index
                    span_is_impossible = True
                else:
                    # Offset of the first document token inside the encoded sequence.
                    if tokenizer.padding_side == "left":
                        doc_offset = 0
                    else:
                        doc_offset = len(truncated_query) + sequence_added_tokens

                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            features.append(SquadFeatures(
                span['input_ids'],
                span['attention_mask'],
                span['token_type_ids'],
                cls_index,
                p_mask.tolist(),

                example_index=example_index,
                unique_id=unique_id,
                paragraph_len=span['paragraph_len'],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],

                start_position=start_position,
                end_position=end_position
            ))

            unique_id += 1

    if return_dataset == 'pt':
        if not is_torch_available():
            raise ImportError("Pytorch must be installed to return a pytorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if not is_training:
            # A running 0..N-1 index with one entry per feature row.
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_example_index, all_cls_index, all_p_mask)
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions,
                                    all_cls_index, all_p_mask)

        return features, dataset

    return features
|
||||||
|
|
||||||
|
|
||||||
|
class SquadProcessor(DataProcessor):
    """
    Processor for the SQuAD data set.
    Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
    """
    # File names looked up inside `data_dir`; subclasses must set both.
    train_file = None
    dev_file = None

    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
        """
        Builds a single :class:`SquadExample` from one element of a TFDS dataset.

        Args:
            tensor_dict: mapping with the keys 'id', 'title', 'question', 'context' and 'answers'
                (itself holding 'text' and 'answer_start'); every value must expose `.numpy()`.
            evaluate: when False only the first answer is kept as the gold answer; when True all
                answers are collected in the `answers` list and no gold answer is set.
        """
        if not evaluate:
            answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
            answer_start = tensor_dict['answers']['answer_start'][0].numpy()
            answers = []
        else:
            answers = [{
                "answer_start": start.numpy(),
                "text": text.numpy().decode('utf-8')
            } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]

            answer = None
            answer_start = None

        return SquadExample(
            qas_id=tensor_dict['id'].numpy().decode("utf-8"),
            question_text=tensor_dict['question'].numpy().decode('utf-8'),
            context_text=tensor_dict['context'].numpy().decode('utf-8'),
            answer_text=answer,
            start_position_character=answer_start,
            title=tensor_dict['title'].numpy().decode('utf-8'),
            answers=answers
        )

    def get_examples_from_dataset(self, dataset, evaluate=False):
        """
        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.

        Args:
            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
            evaluate: boolean specifying if in evaluation mode or in training mode

        Returns:
            List of SquadExample

        Examples::

            import tensorflow_datasets as tfds
            dataset = tfds.load("squad")

            processor = SquadV1Processor()
            training_examples = processor.get_examples_from_dataset(dataset, evaluate=False)
            evaluation_examples = processor.get_examples_from_dataset(dataset, evaluate=True)
        """

        if evaluate:
            dataset = dataset["validation"]
        else:
            dataset = dataset["train"]

        examples = []
        for tensor_dict in tqdm(dataset):
            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))

        return examples

    def get_train_examples(self, data_dir, filename=None):
        """
        Returns the training examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
                `None` is treated as an empty prefix, so `filename` may then be a full path.
            filename: None by default, specify this if the training file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.

        Raises:
            ValueError: if called on the base class, which has no `train_file`.
        """
        if data_dir is None:
            # os.path.join(None, ...) raises TypeError; an empty prefix leaves the file name untouched
            data_dir = ""

        if self.train_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "train")

    def get_dev_examples(self, data_dir, filename=None):
        """
        Returns the evaluation examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
                `None` is treated as an empty prefix, so `filename` may then be a full path.
            filename: None by default, specify this if the evaluation file has a different name than the original one
                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.

        Raises:
            ValueError: if called on the base class, which has no `dev_file`.
        """
        if data_dir is None:
            # os.path.join(None, ...) raises TypeError; an empty prefix leaves the file name untouched
            data_dir = ""

        if self.dev_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "dev")

    def _create_examples(self, input_data, set_type):
        """
        Flattens the nested SQuAD json structure (entries -> paragraphs -> qas) into a list of SquadExample.

        Args:
            input_data: the list stored under the "data" key of a SQuAD json file.
            set_type: "train" keeps the first answer as the gold answer; any other value keeps
                the whole answer list for evaluation.
        """
        is_training = set_type == "train"
        examples = []
        for entry in tqdm(input_data):
            title = entry['title']
            for paragraph in entry["paragraphs"]:
                context_text = paragraph["context"]
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position_character = None
                    answer_text = None
                    answers = []

                    # The key is optional in the json; a missing key means the question is answerable
                    is_impossible = qa.get("is_impossible", False)

                    if not is_impossible:
                        if is_training:
                            answer = qa["answers"][0]
                            answer_text = answer['text']
                            start_position_character = answer['answer_start']
                        else:
                            answers = qa["answers"]

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        context_text=context_text,
                        answer_text=answer_text,
                        start_position_character=start_position_character,
                        title=title,
                        is_impossible=is_impossible,
                        answers=answers
                    )

                    examples.append(example)
        return examples
|
||||||
|
|
||||||
|
class SquadV1Processor(SquadProcessor):
    """SquadProcessor bound to the SQuAD version 1.1 file names."""
    dev_file = "dev-v1.1.json"
    train_file = "train-v1.1.json"
|
||||||
|
|
||||||
|
|
||||||
|
class SquadV2Processor(SquadProcessor):
    """SquadProcessor bound to the SQuAD version 2.0 file names."""
    dev_file = "dev-v2.0.json"
    train_file = "train-v2.0.json"
|
||||||
|
|
||||||
|
|
||||||
|
class SquadExample(object):
    """
    A single training/test example for the Squad dataset, as loaded from disk.

    Args:
        qas_id: The example's unique identifier
        question_text: The question string
        context_text: The context string
        answer_text: The answer string
        start_position_character: The character position of the start of the answer
        title: The title of the example
        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
            When omitted, each example gets its own empty list.
        is_impossible: False by default, set to True if the example has no possible answer.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 context_text,
                 answer_text,
                 start_position_character,
                 title,
                 answers=None,
                 is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.context_text = context_text
        self.answer_text = answer_text
        self.title = title
        self.is_impossible = is_impossible
        # A literal `[]` default would be one list shared (and mutated) across every example
        self.answers = [] if answers is None else answers

        self.start_position, self.end_position = 0, 0

        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True

        # Split on whitespace so that different tokens may be attributed to their original position.
        for c in self.context_text:
            if _is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            # Every character (whitespace included) maps to the index of the latest token
            char_to_word_offset.append(len(doc_tokens) - 1)

        self.doc_tokens = doc_tokens
        self.char_to_word_offset = char_to_word_offset

        # Start and end positions are only computed when a gold answer start is provided
        # and the example is answerable; otherwise they stay at 0.
        if start_position_character is not None and not is_impossible:
            self.start_position = char_to_word_offset[start_position_character]
            # Clamp to the last character so an answer span overshooting the context
            # does not raise IndexError
            last_answer_char = min(
                start_position_character + len(answer_text) - 1,
                len(char_to_word_offset) - 1
            )
            self.end_position = char_to_word_offset[last_answer_char]
|
||||||
|
|
||||||
|
|
||||||
|
class SquadFeatures(object):
    """
    Model-ready features for one span of a squad example.
    These values are model-specific and are produced from :class:`~transformers.data.processors.squad.SquadExample`
    by the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        cls_index: the index of the CLS token.
        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
            Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer
        example_index: the index of the example
        unique_id: The unique Feature identifier
        paragraph_len: The length of the context
        token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
            If a token does not have their maximum context in this feature object, it means that another feature object
            has more information related to that token and should be prioritized over this feature for that token.
        tokens: list of tokens corresponding to the input ids
        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
        start_position: start of the answer token index
        end_position: end of the answer token index
    """

    def __init__(self,
                 input_ids,
                 attention_mask,
                 token_type_ids,
                 cls_index,
                 p_mask,

                 example_index,
                 unique_id,
                 paragraph_len,
                 token_is_max_context,
                 tokens,
                 token_to_orig_map,

                 start_position,
                 end_position
                 ):
        # Answer span, expressed as token indices
        self.start_position = start_position
        self.end_position = end_position

        # Bookkeeping that links the feature back to its example and original text
        self.unique_id = unique_id
        self.example_index = example_index
        self.paragraph_len = paragraph_len
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context

        # Inputs handed to the model
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.cls_index = cls_index
        self.p_mask = p_mask
|
||||||
|
|
||||||
|
|
||||||
|
class SquadResult(object):
    """
    Container for a model's raw output on one feature, used when evaluating on the SQuAD dataset.

    Args:
        unique_id: The unique identifier corresponding to that example.
        start_logits: The logits corresponding to the start of the answer
        end_logits: The logits corresponding to the end of the answer
        start_top_index: optional; when truthy, it is stored together with `end_top_index` and `cls_logits`.
        end_top_index: optional, only stored when `start_top_index` is truthy.
        cls_logits: optional, only stored when `start_top_index` is truthy.
    """
    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
        self.unique_id = unique_id
        self.start_logits = start_logits
        self.end_logits = end_logits

        # Without top-k start indices the three optional attributes are left undefined
        if not start_top_index:
            return

        self.start_top_index = start_top_index
        self.end_top_index = end_top_index
        self.cls_logits = cls_logits
|
||||||
85
transformers/data/processors/xnli.py
Normal file
85
transformers/data/processors/xnli.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" XNLI utils (dataset loading and evaluation) """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from .utils import DataProcessor, InputExample
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class XnliProcessor(DataProcessor):
    """Processor for the XNLI dataset.
    Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""

    def __init__(self, language, train_language=None):
        # `train_language`, when given, overrides `language` for the training file only
        self.train_language = train_language
        self.language = language

    def get_train_examples(self, data_dir):
        """See base class."""
        if self.train_language is None:
            lang = self.language
        else:
            lang = self.train_language
        tsv_path = os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lang))

        collected = []
        for row_number, row in enumerate(self._read_tsv(tsv_path)):
            # Row 0 is skipped (presumably the TSV header)
            if row_number == 0:
                continue
            premise, hypothesis, raw_label = row[0], row[1], row[2]
            # The training file spells this label differently from get_labels()
            if raw_label == "contradictory":
                label = "contradiction"
            else:
                label = raw_label
            assert isinstance(premise, str) and isinstance(hypothesis, str) and isinstance(label, str)
            collected.append(
                InputExample(guid="%s-%s" % ('train', row_number), text_a=premise, text_b=hypothesis, label=label))
        return collected

    def get_test_examples(self, data_dir):
        """See base class."""
        tsv_path = os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")

        collected = []
        for row_number, row in enumerate(self._read_tsv(tsv_path)):
            # Row 0 is skipped (presumably the TSV header); other rows are kept
            # only when their first column matches the requested language
            if row_number == 0 or row[0] != self.language:
                continue
            premise, hypothesis, label = row[6], row[7], row[1]
            assert isinstance(premise, str) and isinstance(hypothesis, str) and isinstance(label, str)
            collected.append(
                InputExample(guid="%s-%s" % ('test', row_number), text_a=premise, text_b=hypothesis, label=label))
        return collected

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]
|
||||||
|
|
||||||
|
# Task-name registries for XNLI: processor class, output mode and number of labels.
xnli_processors = {"xnli": XnliProcessor}

xnli_output_modes = {"xnli": "classification"}

xnli_tasks_num_labels = {"xnli": 3}
|
||||||
@@ -22,6 +22,7 @@ from botocore.config import Config
|
|||||||
from botocore.exceptions import ClientError
|
from botocore.exceptions import ClientError
|
||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
@@ -152,7 +153,7 @@ def filename_to_url(filename, cache_dir=None):
|
|||||||
return url, etag
|
return url, etag
|
||||||
|
|
||||||
|
|
||||||
def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
|
def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False):
|
||||||
"""
|
"""
|
||||||
Given something that might be a URL (or might be a local path),
|
Given something that might be a URL (or might be a local path),
|
||||||
determine which. If it's a URL, download the file and cache it, and
|
determine which. If it's a URL, download the file and cache it, and
|
||||||
@@ -161,6 +162,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
|||||||
Args:
|
Args:
|
||||||
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
||||||
force_download: if True, re-dowload the file even if it's already cached in the cache dir.
|
force_download: if True, re-dowload the file even if it's already cached in the cache dir.
|
||||||
|
resume_download: if True, resume the download if incompletly recieved file is found.
|
||||||
"""
|
"""
|
||||||
if cache_dir is None:
|
if cache_dir is None:
|
||||||
cache_dir = TRANSFORMERS_CACHE
|
cache_dir = TRANSFORMERS_CACHE
|
||||||
@@ -173,7 +175,9 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
|||||||
|
|
||||||
if parsed.scheme in ('http', 'https', 's3'):
|
if parsed.scheme in ('http', 'https', 's3'):
|
||||||
# URL, so get it from the cache (downloading if necessary)
|
# URL, so get it from the cache (downloading if necessary)
|
||||||
return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
return get_from_cache(url_or_filename, cache_dir=cache_dir,
|
||||||
|
force_download=force_download, proxies=proxies,
|
||||||
|
resume_download=resume_download)
|
||||||
elif os.path.exists(url_or_filename):
|
elif os.path.exists(url_or_filename):
|
||||||
# File, and it exists.
|
# File, and it exists.
|
||||||
return url_or_filename
|
return url_or_filename
|
||||||
@@ -234,19 +238,22 @@ def s3_get(url, temp_file, proxies=None):
|
|||||||
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
||||||
|
|
||||||
|
|
||||||
def http_get(url, temp_file, proxies=None):
|
def http_get(url, temp_file, proxies=None, resume_size=0):
|
||||||
req = requests.get(url, stream=True, proxies=proxies)
|
headers={'Range':'bytes=%d-'%(resume_size,)} if resume_size > 0 else None
|
||||||
content_length = req.headers.get('Content-Length')
|
response = requests.get(url, stream=True, proxies=proxies, headers=headers)
|
||||||
total = int(content_length) if content_length is not None else None
|
if response.status_code == 416: # Range not satisfiable
|
||||||
progress = tqdm(unit="B", total=total)
|
return
|
||||||
for chunk in req.iter_content(chunk_size=1024):
|
content_length = response.headers.get('Content-Length')
|
||||||
|
total = resume_size + int(content_length) if content_length is not None else None
|
||||||
|
progress = tqdm(unit="B", total=total, initial=resume_size)
|
||||||
|
for chunk in response.iter_content(chunk_size=1024):
|
||||||
if chunk: # filter out keep-alive new chunks
|
if chunk: # filter out keep-alive new chunks
|
||||||
progress.update(len(chunk))
|
progress.update(len(chunk))
|
||||||
temp_file.write(chunk)
|
temp_file.write(chunk)
|
||||||
progress.close()
|
progress.close()
|
||||||
|
|
||||||
|
|
||||||
def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10):
|
def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False):
|
||||||
"""
|
"""
|
||||||
Given a URL, look for the corresponding dataset in the local cache.
|
Given a URL, look for the corresponding dataset in the local cache.
|
||||||
If it's not there, download it. Then return the path to the cached file.
|
If it's not there, download it. Then return the path to the cached file.
|
||||||
@@ -289,17 +296,35 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
|
|||||||
if matching_files:
|
if matching_files:
|
||||||
cache_path = os.path.join(cache_dir, matching_files[-1])
|
cache_path = os.path.join(cache_dir, matching_files[-1])
|
||||||
|
|
||||||
|
if resume_download:
|
||||||
|
incomplete_path = cache_path + '.incomplete'
|
||||||
|
@contextmanager
|
||||||
|
def _resumable_file_manager():
|
||||||
|
with open(incomplete_path,'a+b') as f:
|
||||||
|
yield f
|
||||||
|
os.remove(incomplete_path)
|
||||||
|
temp_file_manager = _resumable_file_manager
|
||||||
|
if os.path.exists(incomplete_path):
|
||||||
|
resume_size = os.stat(incomplete_path).st_size
|
||||||
|
else:
|
||||||
|
resume_size = 0
|
||||||
|
else:
|
||||||
|
temp_file_manager = tempfile.NamedTemporaryFile
|
||||||
|
resume_size = 0
|
||||||
|
|
||||||
if not os.path.exists(cache_path) or force_download:
|
if not os.path.exists(cache_path) or force_download:
|
||||||
# Download to temporary file, then copy to cache dir once finished.
|
# Download to temporary file, then copy to cache dir once finished.
|
||||||
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
||||||
with tempfile.NamedTemporaryFile() as temp_file:
|
with temp_file_manager() as temp_file:
|
||||||
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
|
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
|
||||||
|
|
||||||
# GET file object
|
# GET file object
|
||||||
if url.startswith("s3://"):
|
if url.startswith("s3://"):
|
||||||
|
if resume_download:
|
||||||
|
logger.warn('Warning: resumable downloads are not implemented for "s3://" urls')
|
||||||
s3_get(url, temp_file, proxies=proxies)
|
s3_get(url, temp_file, proxies=proxies)
|
||||||
else:
|
else:
|
||||||
http_get(url, temp_file, proxies=proxies)
|
http_get(url, temp_file, proxies=proxies, resume_size=resume_size)
|
||||||
|
|
||||||
# we are copying the file before closing it, so flush to avoid truncation
|
# we are copying the file before closing it, so flush to avoid truncation
|
||||||
temp_file.flush()
|
temp_file.flush()
|
||||||
|
|||||||
228
transformers/hf_api.py
Normal file
228
transformers/hf_api.py
Normal file
@@ -0,0 +1,228 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
|
import os
|
||||||
|
from os.path import expanduser
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import six
|
||||||
|
from requests.exceptions import HTTPError
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
ENDPOINT = "https://huggingface.co"
|
||||||
|
|
||||||
|
class S3Obj:
    """
    Plain record describing one stored file.
    Any extra keyword arguments are accepted and ignored.
    """
    def __init__(
        self,
        filename,      # type: str
        LastModified,  # type: str
        ETag,          # type: str
        Size,          # type: int
        **kwargs
    ):
        self.Size = Size
        self.ETag = ETag
        self.LastModified = LastModified
        self.filename = filename
|
||||||
|
|
||||||
|
|
||||||
|
class PresignedUrl:
    """
    Plain record holding the `write` and `access` urls plus the mime `type` for an upload.
    Any extra keyword arguments are accepted and ignored.
    """
    def __init__(
        self,
        write,   # type: str
        access,  # type: str
        type,    # type: str
        **kwargs
    ):
        self.type = type  # mime-type to send to S3.
        self.access = access
        self.write = write
|
||||||
|
|
||||||
|
|
||||||
|
class HfApi:
    """
    Thin client for the HF HTTP API.
    Every call raises `requests.exceptions.HTTPError` (through `raise_for_status`) on an error status.
    """
    def __init__(self, endpoint=None):
        # Fall back to the module-level ENDPOINT when no endpoint is given
        self.endpoint = endpoint if endpoint is not None else ENDPOINT

    def login(
        self,
        username,  # type: str
        password,  # type: str
    ):
        # type: (...) -> str
        """
        Call HF API to sign in a user and get a token if credentials are valid.

        Outputs:
            token if credentials are valid

        Throws:
            requests.exceptions.HTTPError if credentials are invalid
        """
        path = "{}/api/login".format(self.endpoint)
        r = requests.post(path, json={"username": username, "password": password})
        r.raise_for_status()
        d = r.json()
        return d["token"]

    def whoami(
        self,
        token,  # type: str
    ):
        # type: (...) -> str
        """
        Call HF API to know "whoami"

        Outputs:
            the "user" field of the response
        """
        path = "{}/api/whoami".format(self.endpoint)
        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()
        d = r.json()
        return d["user"]

    def logout(self, token):
        # type: (...) -> None
        """
        Call HF API to log out.
        """
        path = "{}/api/logout".format(self.endpoint)
        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()

    def presign(self, token, filename):
        # type: (...) -> PresignedUrl
        """
        Call HF API to get a presigned url to upload `filename` to S3.
        """
        path = "{}/api/presign".format(self.endpoint)
        r = requests.post(
            path,
            headers={"authorization": "Bearer {}".format(token)},
            json={"filename": filename},
        )
        r.raise_for_status()
        d = r.json()
        return PresignedUrl(**d)

    def presign_and_upload(self, token, filename, filepath):
        # type: (...) -> str
        """
        Get a presigned url, then upload file to S3.

        Outputs:
            url: Read-only url for the stored file on S3.
        """
        urls = self.presign(token, filename=filename)
        # streaming upload:
        # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
        #
        # Even though we presign with the correct content-type,
        # the client still has to specify it when uploading the file.
        with open(filepath, "rb") as f:
            pf = TqdmProgressFileReader(f)
            try:
                r = requests.put(urls.write, data=f, headers={
                    "content-type": urls.type,
                })
                r.raise_for_status()
            finally:
                # Close the progress bar even when the upload fails
                pf.close()
        return urls.access

    def list_objs(self, token):
        # type: (...) -> List[S3Obj]
        """
        Call HF API to list all stored files for user.
        """
        path = "{}/api/listObjs".format(self.endpoint)
        r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
        r.raise_for_status()
        d = r.json()
        return [S3Obj(**x) for x in d]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TqdmProgressFileReader:
    """
    Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
    and override `f.read()` so as to display a tqdm progress bar.

    see github.com/huggingface/transformers/pull/2078#discussion_r354739608
    for implementation details.
    """
    def __init__(
        self,
        f  # type: io.BufferedReader
    ):
        self.f = f
        self.total_size = os.fstat(f.fileno()).st_size  # type: int
        self.pbar = tqdm(total=self.total_size, leave=False)
        if six.PY3:
            # does not work unless PY3
            # no big deal as the CLI does not currently support PY2 anyways.
            self.read = f.read
            f.read = self._read

    def _read(self, n=-1):
        """Read through the original `read` and advance the bar by the bytes actually returned."""
        data = self.read(n)
        # The requested size may be -1 (read all) or larger than what is left,
        # so the bar must follow the real payload length, not `n`.
        self.pbar.update(len(data))
        return data

    def close(self):
        """Close the progress bar; the wrapped file is left open."""
        self.pbar.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class HfFolder:
    """Stores, reads and deletes the API token kept in the file at `path_token`."""
    path_token = expanduser("~/.huggingface/token")

    @classmethod
    def save_token(cls, token):
        """
        Save token, creating folder as needed.
        """
        # Local import keeps this block self-contained; `os.errno` is not a public API
        import errno

        # A single code path for Python 2 and 3: create the folder and tolerate
        # only the "already exists" error.
        try:
            os.makedirs(os.path.dirname(cls.path_token))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        with open(cls.path_token, 'w+') as f:
            f.write(token)

    @classmethod
    def get_token(cls):
        """
        Get token or None if not existent.
        """
        try:
            with open(cls.path_token, 'r') as f:
                return f.read()
        except (IOError, OSError):
            # Python 2 raises IOError for a missing file, Python 3 raises
            # FileNotFoundError (an OSError); anything else is a real bug and propagates.
            return None

    @classmethod
    def delete_token(cls):
        """
        Delete token.
        Do not fail if token does not exist.
        """
        try:
            os.remove(cls.path_token)
        except OSError:
            # Missing file (or any other OS-level failure) is deliberately ignored
            return
|
||||||
801
transformers/modeling_albert.py
Normal file
801
transformers/modeling_albert.py
Normal file
@@ -0,0 +1,801 @@
|
|||||||
|
|
||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""PyTorch ALBERT model. """
|
||||||
|
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
import logging
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
from transformers.modeling_utils import PreTrainedModel
|
||||||
|
from transformers.configuration_albert import AlbertConfig
|
||||||
|
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention, prune_linear_layer, ACT2FN
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Shortcut model name -> url of the PyTorch weights file, for the v1 and v2 ALBERT checkpoints.
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    # version 1 checkpoints
    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
    # version 2 checkpoints
    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model.

    Every variable stored in the TensorFlow checkpoint is read, its name is rewritten to the
    attribute path used by the PyTorch modules of this file, and its value is copied into the
    parameter reached by walking that path on ``model``.

    Args:
        model: PyTorch module whose attributes are walked with ``getattr`` to find each target parameter.
        config: not used by this function.
        tf_checkpoint_path: path of the TensorFlow checkpoint (made absolute before loading).

    Returns:
        The same ``model`` object, with the matched parameters overwritten.

    Raises:
        ImportError: if ``re``, ``numpy`` or ``tensorflow`` cannot be imported.
        AssertionError: if a checkpoint array and its target parameter differ in shape.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        original_name = name

        # If saved from the TF HUB module
        name = name.replace("module/", "")

        # Renaming and simplifying.
        # The replacements are order-sensitive (e.g. "LayerNorm_1" must be handled before "LayerNorm").
        name = name.replace("ffn_1", "ffn")
        name = name.replace("bert/", "albert/")
        name = name.replace("attention_1", "attention")
        name = name.replace("transform/", "")
        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
        name = name.replace("LayerNorm", "attention/LayerNorm")
        name = name.replace("transformer/", "")

        # The feed forward layer had an 'intermediate' step which has been abstracted away
        name = name.replace("intermediate/dense/", "")
        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")

        # ALBERT attention was split between self and output which have been abstracted away
        name = name.replace("/output/", "/")
        name = name.replace("/self/", "/")

        # The pooler is a linear layer
        name = name.replace("pooler/dense", "pooler")

        # The classifier was simplified to predictions from cls/predictions
        name = name.replace("cls/predictions", "predictions")
        name = name.replace("predictions/attention", "predictions")

        # Naming was changed to be more explicit
        name = name.replace("embeddings/attention", "embeddings")
        name = name.replace("inner_group_", "albert_layers/")
        name = name.replace("group_", "albert_layer_groups/")

        # Classifier
        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
            name = "classifier/" + name

        # No ALBERT model currently handles the next sentence prediction task
        if "seq_relationship" in name:
            continue

        name = name.split('/')

        # Ignore the gradients applied by the LAMB/ADAM optimizers.
        if "adam_m" in name or "adam_v" in name or "global_step" in name:
            logger.info("Skipping {}".format("/".join(name)))
            continue

        # Walk the module tree one path component at a time, starting from the model itself.
        pointer = model
        for m_name in name:
            # Components such as "layer_3" are split into an attribute name and an index.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                scope_names = re.split(r'_(\d+)', m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == 'kernel' or scope_names[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif scope_names[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            elif scope_names[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    # NOTE(review): this skips only the current path component, not the whole
                    # variable -- confirm this is the intended behaviour.
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        # `m_name` is the last path component at this point.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            # Kernels are transposed before being copied into the PyTorch weight.
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            # Attach both shapes to the error to make mismatches easier to diagnose.
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {} from {}".format(name, original_name))
        pointer.data = torch.from_numpy(array)

    return model
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertEmbeddings(BertEmbeddings):
    """
    Construct the embeddings from word, position and token_type embeddings.

    After the parent constructor has run, the three embedding tables and the LayerNorm are
    re-created with ``config.embedding_size`` as their embedding dimension.
    """
    def __init__(self, config):
        super(AlbertEmbeddings, self).__init__(config)

        embedding_dim = config.embedding_size

        # Replace the modules created by the parent class with ones sized by `embedding_size`.
        self.word_embeddings = nn.Embedding(config.vocab_size, embedding_dim, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, embedding_dim)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, embedding_dim)
        self.LayerNorm = torch.nn.LayerNorm(embedding_dim, eps=config.layer_norm_eps)
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertAttention(BertSelfAttention):
    """Self-attention followed by an output projection, dropout, residual connection and LayerNorm.

    The query/key/value projections and ``transpose_for_scores`` come from ``BertSelfAttention``;
    this class adds the ``dense`` output projection and the ``LayerNorm``.
    """
    def __init__(self, config):
        super(AlbertAttention, self).__init__(config)

        self.output_attentions = config.output_attentions
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the query/key/value/dense projections.

        Args:
            heads: iterable of head indices to prune; heads already pruned are ignored.
        """
        if len(heads) == 0:
            return
        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
        heads = set(heads) - self.pruned_heads  # Convert to set and remove already pruned heads
        for head in heads:
            # Compute how many pruned heads are before the head and move the index accordingly
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()

        # Prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, input_ids, attention_mask=None, head_mask=None):
        """Apply self-attention to the input.

        Args:
            input_ids: input fed to the query/key/value projections and to the residual
                connection (``AlbertLayer`` passes its hidden states here).
            attention_mask: optional tensor added to the raw attention scores.
            head_mask: optional tensor multiplied with the attention probabilities.

        Returns:
            ``(output,)`` or ``(output, attention_probs)`` when ``output_attentions`` is set.
        """
        mixed_query_layer = self.query(input_ids)
        mixed_key_layer = self.key(input_ids)
        mixed_value_layer = self.value(input_ids)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed once for all layers in the model's forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        # Move the head axis next to the per-head feature axis so the einsum below can merge them.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()

        # Should find a better way to do this
        # The dense weight is viewed per head so the projection merges the heads in a single einsum.
        w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype)
        b = self.dense.bias.to(context_layer.dtype)

        projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
        projected_context_layer_dropout = self.dropout(projected_context_layer)
        # Residual connection with the block input, then LayerNorm.
        layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer_dropout)
        return (layernormed_context_layer, attention_probs) if self.output_attentions else (layernormed_context_layer,)
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertLayer(nn.Module):
    """One ALBERT layer: attention block, feed-forward network, residual connection and LayerNorm."""

    def __init__(self, config):
        super(AlbertLayer, self).__init__()

        self.config = config
        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = AlbertAttention(config)
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        """Run the attention block and the feed-forward network.

        Returns:
            A tuple whose first element is the layer output, followed by whatever extra
            elements the attention block returned after its first one.
        """
        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
        attended = attention_outputs[0]

        # Feed-forward network: expand, activate, project back.
        projected = self.ffn_output(self.activation(self.ffn(attended)))

        # Residual connection around the feed-forward network, then LayerNorm.
        normalized = self.full_layer_layer_norm(projected + attended)

        return (normalized,) + attention_outputs[1:]  # add attentions if we output them
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertLayerGroup(nn.Module):
    """A group of ``config.inner_group_num`` ALBERT layers applied one after the other."""

    def __init__(self, config):
        super(AlbertLayerGroup, self).__init__()

        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        """Run every layer of the group in order.

        Args:
            hidden_states: input of the first layer of the group.
            attention_mask: passed unchanged to every layer.
            head_mask: optional sequence indexed by the position of the layer inside the group;
                when ``None`` every layer receives ``None``.

        Returns:
            ``(hidden_states,)`` followed by the tuple of per-layer hidden states when
            ``output_hidden_states`` is set, then the tuple of per-layer attentions when
            ``output_attentions`` is set.
        """
        layer_hidden_states = ()
        layer_attentions = ()

        for layer_index, albert_layer in enumerate(self.albert_layers):
            # `head_mask` defaults to None, so it must not be indexed unconditionally.
            layer_head_mask = head_mask[layer_index] if head_mask is not None else None
            layer_output = albert_layer(hidden_states, attention_mask, layer_head_mask)
            hidden_states = layer_output[0]

            if self.output_attentions:
                layer_attentions = layer_attentions + (layer_output[1],)

            if self.output_hidden_states:
                layer_hidden_states = layer_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (layer_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (layer_attentions,)
        return outputs  # last-layer hidden state, (layer hidden states), (layer attentions)
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertTransformer(nn.Module):
    """Stack of ALBERT layer groups, preceded by a projection from embedding size to hidden size.

    ``config.num_hidden_layers`` steps are run, but only ``config.num_hidden_groups`` layer
    groups are instantiated; each step selects the group it belongs to.
    """

    def __init__(self, config):
        super(AlbertTransformer, self).__init__()

        self.config = config
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        """Project the embeddings and run all hidden layers.

        Args:
            hidden_states: embedding output, projected by ``embedding_hidden_mapping_in`` first.
            attention_mask: passed unchanged to every layer group.
            head_mask: optional sliceable sequence with one entry per hidden layer; when ``None``
                a list of ``None`` of length ``config.num_hidden_layers`` is used.

        Returns:
            ``(hidden_states,)`` followed by all hidden states when ``output_hidden_states`` is
            set, then all attentions when ``output_attentions`` is set.
        """
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        # `head_mask` defaults to None but is sliced below, so substitute a per-layer list of None.
        if head_mask is None:
            head_mask = [None] * self.config.num_hidden_layers

        all_attentions = ()

        if self.output_hidden_states:
            all_hidden_states = (hidden_states,)

        # Number of layers in a hidden group (loop-invariant)
        layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)

        for i in range(self.config.num_hidden_layers):
            # Index of the hidden group
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))

            # Slice of the head mask covering the layers of this group
            mask_start = group_idx * layers_per_group
            group_head_mask = head_mask[mask_start:mask_start + layers_per_group]

            layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, group_head_mask)
            hidden_states = layer_group_output[0]

            if self.output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class AlbertPreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    config_class = AlbertConfig
    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "albert"

    def _init_weights(self, module):
        """ Initialize the weights of a single module.

            LayerNorm modules get a zero bias and unit weight; Linear and Embedding modules get
            normally distributed weights, and Linear biases are zeroed.
        """
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        # Slightly different from the TF version which uses truncated_normal for initialization
        # cf https://github.com/pytorch/pytorch/pull/5617
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

        has_bias = isinstance(module, nn.Linear) and module.bias is not None
        if has_bias:
            module.bias.data.zero_()
|
||||||
|
|
||||||
|
|
||||||
|
ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in
|
||||||
|
`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
|
||||||
|
by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
|
||||||
|
two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT.
|
||||||
|
|
||||||
|
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
|
||||||
|
refer to the PyTorch documentation for all matter related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
|
||||||
|
https://arxiv.org/abs/1909.11942
|
||||||
|
|
||||||
|
.. _`torch.nn.Module`:
|
||||||
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
|
||||||
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
ALBERT_INPUTS_DOCSTRING = r"""
|
||||||
|
Inputs:
|
||||||
|
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||||
|
|
||||||
|
(a) For sequence pairs:
|
||||||
|
|
||||||
|
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||||
|
|
||||||
|
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||||
|
|
||||||
|
(b) For single sequences:
|
||||||
|
|
||||||
|
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||||
|
|
||||||
|
``token_type_ids: 0 0 0 0 0 0 0``
|
||||||
|
|
||||||
|
Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
|
the right rather than the left.
|
||||||
|
|
||||||
|
Indices can be obtained using :class:`transformers.AlbertTokenizer`.
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Segment token indices to indicate first and second portions of the inputs.
|
||||||
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
|
corresponds to a `sentence B` token
|
||||||
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ for more details).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertModel(AlbertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    """

    config_class = AlbertConfig
    pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = load_tf_weights_in_albert
    base_model_prefix = "albert"

    def __init__(self, config):
        super(AlbertModel, self).__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
        self.pooler_activation = nn.Tanh()

        self.init_weights()

    def get_input_embeddings(self):
        """Return the word embedding module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word embedding module with ``value``."""
        self.embeddings.word_embeddings = value

    def _resize_token_embeddings(self, new_num_tokens):
        """Resize the word embeddings to ``new_num_tokens`` entries and return the new module."""
        resized = self._get_resized_embeddings(self.embeddings.word_embeddings, new_num_tokens)
        self.embeddings.word_embeddings = resized
        return self.embeddings.word_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
            If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
            is a total of 4 different layers.

            These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
            while [2,3] correspond to the two inner groups of the second hidden layer.

            Any layer with an index other than [0,1,2,3] will result in an error.
            See base class PreTrainedModel for more information about head pruning
        """
        inner_group_num = self.config.inner_group_num
        for layer, heads in heads_to_prune.items():
            # Split the flattened layer index into (group, position inside the group).
            group_idx = int(layer / inner_group_num)
            inner_group_idx = int(layer - group_idx * inner_group_num)
            target_layer = self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx]
            target_layer.attention.prune_heads(heads)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                inputs_embeds=None):
        # Exactly one of `input_ids` / `inputs_embeds` must be given.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        # Default masks: attend everywhere, every token in segment 0.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Turn the 0/1 mask into an additive mask: 0.0 where the mask is 1, -10000.0 where it is 0.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        if head_mask is None:
            head_mask = [None] * self.config.num_hidden_layers
        else:
            if head_mask.dim() == 1:
                # A single mask is expanded to one entry per hidden layer.
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility

        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                                           inputs_embeds=inputs_embeds)
        encoder_outputs = self.encoder(embedding_output,
                                       extended_attention_mask,
                                       head_mask=head_mask)

        sequence_output = encoder_outputs[0]

        # Pool by taking the hidden state of the first token through a Linear + Tanh.
        first_token_state = sequence_output[:, 0]
        pooled_output = self.pooler_activation(self.pooler(first_token_state))

        # add hidden_states and attentions if they are here
        return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||||
|
|
||||||
|
class AlbertMLMHead(nn.Module):
    """Masked language modeling head: dense -> activation -> LayerNorm -> decoder, plus a vocabulary bias."""

    def __init__(self, config):
        super(AlbertMLMHead, self).__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        """Return the prediction scores computed from ``hidden_states``."""
        # Project to the embedding size, apply the activation, then normalize.
        transformed = self.LayerNorm(self.activation(self.dense(hidden_states)))

        # Decode to vocabulary size and add the separate bias parameter.
        return self.decoder(transformed) + self.bias
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertForMaskedLM(AlbertPreTrainedModel):
    r"""
        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    """

    def __init__(self, config):
        super(AlbertForMaskedLM, self).__init__(config)

        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)

        self.init_weights()
        self.tie_weights()

    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(self.predictions.decoder,
                                   self.albert.embeddings.word_embeddings)

    def get_output_embeddings(self):
        """Return the decoder module of the prediction head."""
        return self.predictions.decoder

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
                masked_lm_labels=None):
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )
        sequence_outputs = outputs[0]

        prediction_scores = self.predictions(sequence_outputs)

        # Element 1 of the base model output (the pooled output) is dropped here.
        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
        if masked_lm_labels is not None:
            # Labels equal to -1 are ignored by the loss.
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
            outputs = (masked_lm_loss,) + outputs

        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
        super(AlbertForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        # Encoder, dropout on the pooled output, then the linear classifier.
        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        encoder_outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )

        # Element 1 of the encoder output is the pooled representation.
        pooled = self.dropout(encoder_outputs[1])
        logits = self.classifier(pooled)

        # add hidden states and attention if they are here
        result = (logits,) + encoder_outputs[2:]
        if labels is None:
            return result  # logits, (hidden_states), (attentions)

        if self.num_labels == 1:
            # We are doing regression
            loss = MSELoss()(logits.view(-1), labels.view(-1))
        else:
            loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss,) + result  # loss, logits, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    r"""
        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.
        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Position outside of the sequence are not taken into account for computing the loss.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Total span extraction loss is the average of the Cross-Entropy losses for the start and end positions.
        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
            Span-start scores (before SoftMax).
        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
            Span-end scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
        input_ids = tokenizer.encode(input_text)
        # Look the separator id up on the tokenizer: the id is vocabulary-specific.
        token_type_ids = [0 if i <= input_ids.index(tokenizer.sep_token_id) else 1 for i in range(len(input_ids))]
        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
        # a nice puppet


    """
    def __init__(self, config):
        super(AlbertForQuestionAnswering, self).__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                inputs_embeds=None, start_positions=None, end_positions=None):

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        # The two-way unpacking below requires exactly two slices along the last dimension.
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits,) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            # Clamp out of place: the in-place variant would overwrite the caller's
            # label tensors (also through the squeeze() view above).
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            # Average (not sum) of the two span losses.
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
|
||||||
@@ -27,6 +27,9 @@ from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassi
|
|||||||
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
|
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
|
||||||
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
|
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
|
||||||
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
|
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
|
||||||
|
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
|
||||||
|
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
|
||||||
|
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
|
||||||
from .modeling_t5 import T5Model, T5WithLMHeadModel
|
from .modeling_t5 import T5Model, T5WithLMHeadModel
|
||||||
|
|
||||||
from .modeling_utils import PreTrainedModel, SequenceSummary
|
from .modeling_utils import PreTrainedModel, SequenceSummary
|
||||||
@@ -50,14 +53,16 @@ class AutoModel(object):
|
|||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `t5`: T5Model (T5 model)
|
- contains `t5`: T5Model (T5 model)
|
||||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertModel (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertModel (CamemBERT model)
|
||||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||||
- contains `bert`: BertModel (Bert model)
|
- contains `bert`: BertModel (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||||
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
|
||||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetModel (XLNet model)
|
- contains `xlnet`: XLNetModel (XLNet model)
|
||||||
- contains `xlm`: XLMModel (XLM model)
|
- contains `xlm`: XLMModel (XLM model)
|
||||||
|
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
||||||
|
|
||||||
This class cannot be instantiated using `__init__()` (throws an error).
|
This class cannot be instantiated using `__init__()` (throws an error).
|
||||||
"""
|
"""
|
||||||
@@ -74,14 +79,16 @@ class AutoModel(object):
|
|||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `t5`: T5Model (T5 model)
|
- contains `t5`: T5Model (T5 model)
|
||||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertModel (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertModel (CamemBERT model)
|
||||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||||
- contains `bert`: BertModel (Bert model)
|
- contains `bert`: BertModel (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||||
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
|
||||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetModel (XLNet model)
|
- contains `xlnet`: XLNetModel (XLNet model)
|
||||||
- contains `xlm`: XLMModel (XLM model)
|
- contains `xlm`: XLMModel (XLM model)
|
||||||
|
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
||||||
|
|
||||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||||
To train the model, you should first set it back in training mode with `model.train()`
|
To train the model, you should first set it back in training mode with `model.train()`
|
||||||
@@ -115,6 +122,9 @@ class AutoModel(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -143,6 +153,10 @@ class AutoModel(object):
|
|||||||
return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'distilbert' in pretrained_model_name_or_path:
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'camembert' in pretrained_model_name_or_path:
|
||||||
|
return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -161,7 +175,7 @@ class AutoModel(object):
|
|||||||
return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||||
"'xlm', 'roberta, 'ctrl'".format(pretrained_model_name_or_path))
|
"'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
class AutoModelWithLMHead(object):
|
class AutoModelWithLMHead(object):
|
||||||
@@ -178,14 +192,16 @@ class AutoModelWithLMHead(object):
|
|||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `t5`: T5ModelWithLMHead (T5 model)
|
- contains `t5`: T5ModelWithLMHead (T5 model)
|
||||||
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
||||||
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
||||||
- contains `bert`: BertForMaskedLM (Bert model)
|
- contains `bert`: BertForMaskedLM (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
|
||||||
- contains `ctrl`: CTRLLMModel (Salesforce CTRL model)
|
|
||||||
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
||||||
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
||||||
|
- contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
|
||||||
|
|
||||||
This class cannot be instantiated using `__init__()` (throws an error).
|
This class cannot be instantiated using `__init__()` (throws an error).
|
||||||
"""
|
"""
|
||||||
@@ -205,6 +221,8 @@ class AutoModelWithLMHead(object):
|
|||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `t5`: T5ModelWithLMHead (T5 model)
|
- contains `t5`: T5ModelWithLMHead (T5 model)
|
||||||
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
||||||
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
||||||
- contains `bert`: BertForMaskedLM (Bert model)
|
- contains `bert`: BertForMaskedLM (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
||||||
@@ -212,6 +230,7 @@ class AutoModelWithLMHead(object):
|
|||||||
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
||||||
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
||||||
|
- contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
|
||||||
|
|
||||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||||
To train the model, you should first set it back in training mode with `model.train()`
|
To train the model, you should first set it back in training mode with `model.train()`
|
||||||
@@ -244,6 +263,8 @@ class AutoModelWithLMHead(object):
|
|||||||
|
|
||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
@@ -273,6 +294,10 @@ class AutoModelWithLMHead(object):
|
|||||||
return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'distilbert' in pretrained_model_name_or_path:
|
elif 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'camembert' in pretrained_model_name_or_path:
|
||||||
|
return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -291,7 +316,7 @@ class AutoModelWithLMHead(object):
|
|||||||
return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||||
"'xlm', 'roberta','ctrl'".format(pretrained_model_name_or_path))
|
"'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
class AutoModelForSequenceClassification(object):
|
class AutoModelForSequenceClassification(object):
|
||||||
@@ -307,6 +332,8 @@ class AutoModelForSequenceClassification(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForSequenceClassification (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
|
||||||
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
||||||
- contains `bert`: BertForSequenceClassification (Bert model)
|
- contains `bert`: BertForSequenceClassification (Bert model)
|
||||||
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
||||||
@@ -329,6 +356,8 @@ class AutoModelForSequenceClassification(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForSequenceClassification (ALBERT model)
|
||||||
|
- contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
|
||||||
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
||||||
- contains `bert`: BertForSequenceClassification (Bert model)
|
- contains `bert`: BertForSequenceClassification (Bert model)
|
||||||
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
||||||
@@ -366,6 +395,9 @@ class AutoModelForSequenceClassification(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -392,6 +424,10 @@ class AutoModelForSequenceClassification(object):
|
|||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'camembert' in pretrained_model_name_or_path:
|
||||||
|
return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
@@ -402,7 +438,7 @@ class AutoModelForSequenceClassification(object):
|
|||||||
return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
"'bert', 'xlnet', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
class AutoModelForQuestionAnswering(object):
|
class AutoModelForQuestionAnswering(object):
|
||||||
@@ -418,6 +454,7 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForQuestionAnswering (ALBERT model)
|
||||||
- contains `bert`: BertForQuestionAnswering (Bert model)
|
- contains `bert`: BertForQuestionAnswering (Bert model)
|
||||||
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
||||||
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
||||||
@@ -439,6 +476,7 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
The model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
||||||
|
- contains `albert`: AlbertForQuestionAnswering (ALBERT model)
|
||||||
- contains `bert`: BertForQuestionAnswering (Bert model)
|
- contains `bert`: BertForQuestionAnswering (Bert model)
|
||||||
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
||||||
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
||||||
@@ -501,6 +539,8 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
"""
|
"""
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
elif 'albert' in pretrained_model_name_or_path:
|
||||||
|
return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
elif 'xlnet' in pretrained_model_name_or_path:
|
elif 'xlnet' in pretrained_model_name_or_path:
|
||||||
@@ -509,4 +549,4 @@ class AutoModelForQuestionAnswering(object):
|
|||||||
return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||||
|
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
|
"'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path))
|
||||||
|
|||||||
@@ -1,271 +0,0 @@
|
|||||||
# coding=utf-8
|
|
||||||
# Copyright (c) 2019 Yang Liu
|
|
||||||
|
|
||||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
# of this software and associated documentation files (the "Software"), to deal
|
|
||||||
# in the Software without restriction, including without limitation the rights
|
|
||||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
# copies of the Software, and to permit persons to whom the Software is
|
|
||||||
# furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
# The above copyright notice and this permission notice shall be included in all
|
|
||||||
# copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
# SOFTWARE.
|
|
||||||
"""
|
|
||||||
A general wrapper around models with LM heads to generate sequences
|
|
||||||
using beam search.
|
|
||||||
"""
|
|
||||||
import torch
|
|
||||||
from torch import nn
|
|
||||||
|
|
||||||
|
|
||||||
class TransformerBeamSearch(nn.Module):
    """ Batched beam search around an encoder-decoder model with an LM head.

    The search state is flattened: row ``b * beam_size + k`` of every state
    tensor belongs to beam ``k`` of the ``b``-th batch element that is still
    being decoded. Batch elements are dropped from the state as soon as their
    most probable beam ends.
    """

    def __init__(
        self,
        model,
        tokenizer,
        batch_size,
        beam_size,
        min_length,
        max_length,
        alpha=0,
        block_repeating_trigram=True,
    ):
        """
        Params:
            model: object exposing `encoder` and `decoder` callables.
            tokenizer: object exposing `start_token_id`, `end_token_id`
                and `pad_token_id`.
            batch_size: number of sequences decoded in parallel.
            beam_size: number of hypotheses kept per sequence.
            min_length: the end token is blocked while `_step < min_length`.
            max_length: maximum number of decoding steps.
            alpha: exponent of the length penalty (0 disables the penalty).
            block_repeating_trigram: discard beams whose last trigram
                already occurred earlier in the same beam.
        """
        super(TransformerBeamSearch, self).__init__()
        self.model = model
        self.tokenizer = tokenizer

        self.start_token_id = tokenizer.start_token_id
        self.end_token_id = tokenizer.end_token_id
        self.pad_token_id = tokenizer.pad_token_id

        # `forward` needs the batch size to rebuild the beam state.
        self.batch_size = batch_size
        self.beam_size = beam_size
        self.min_length = min_length
        self.max_length = max_length

        self.block_repeating_trigram = block_repeating_trigram
        self.apply_length_penalty = alpha != 0
        self.alpha = alpha

        self._reset_state()

    def _reset_state(self):
        """ (Re)initializes the state of the beam for a fresh search. """
        batch_size, beam_size = self.batch_size, self.beam_size

        self.hypotheses = [[] for _ in range(batch_size)]
        # Original batch index of every batch element still being decoded.
        self.batch_offset = torch.arange(batch_size, dtype=torch.long)
        # Row of the first beam of each batch element in the flattened state.
        self.beam_offset = torch.arange(
            0, batch_size * beam_size, step=beam_size, dtype=torch.long
        )
        self.growing_beam = torch.full(
            (batch_size * beam_size, 1), self.start_token_id, dtype=torch.long
        )
        # Only the first beam of each batch element starts with a finite
        # score, otherwise the first step would select `beam_size` copies
        # of the same token.
        self.topk_log_probabilities = torch.tensor(
            [0.0] + [float("-inf")] * (beam_size - 1), dtype=torch.float
        ).repeat(batch_size)
        self.results = {
            "predictions": [[] for _ in range(batch_size)],
            "scores": [[] for _ in range(batch_size)],
        }
        self._step = 0
        self.is_done = False

    def step(self, log_probabilities):
        """ Grows the beam by one step.

        Params:
            log_probabilities: next-token log-probabilities with one row per
                live beam and one column per vocabulary entry.

        Returns:
            1D LongTensor with, for every beam that is still alive after this
            step, the row it descends from in the state *before* the step.
        """
        self._step += 1
        device = log_probabilities.device

        # The state is created on CPU; follow the device of the scores.
        self.growing_beam = self.growing_beam.to(device)
        self.batch_offset = self.batch_offset.to(device)

        # The batch size changes as some beams finish so we define _B
        vocab_size = log_probabilities.size(-1)
        _B = log_probabilities.size(0) // self.beam_size

        # Multiply each beam probability with the probability of the
        # next token (conditioned on the words in the beam).
        # Done out-of-place so that the caller's tensor is left untouched.
        log_probabilities = log_probabilities + self.topk_log_probabilities.to(device).view(-1, 1)

        self.enforce_min_length(log_probabilities)
        if self.block_repeating_trigram:
            self.remove_repeating_trigrams(log_probabilities, _B)

        # Find the `beam_size` (previous_beam + token) combinations with
        # the highest score
        topk_log_probabilities, topk_ids = log_probabilities.reshape(
            _B, self.beam_size * vocab_size
        ).topk(self.beam_size, dim=1)

        # Apply the length penalty. The +1 accounts for the [EOS] token
        # that will be added if the beam ends.
        topk_scores = topk_log_probabilities / self.length_penalty()

        # Retrieve the corresponding respective beam and token id
        # topk_token_ids[i] will be added to topk_beam_ids[i]
        # Floor division: beam ids are used as integer row indices.
        topk_beam_ids = topk_ids // vocab_size
        topk_token_ids = topk_ids.fmod(vocab_size)

        # Retrieve the row index of the surviving beams in the original
        # view of the log_probabilities tensor
        beam_offset = self.beam_offset[:_B].to(device)
        surviving_beams_rows = (topk_beam_ids + beam_offset.view(-1, 1)).view(-1)

        # Append the last predictions
        self.growing_beam = torch.cat(
            [
                self.growing_beam.index_select(0, surviving_beams_rows),
                topk_token_ids.view(-1, 1),
            ],
            1,
        )
        # Cumulative scores of the selected beams, used by the next step.
        self.topk_log_probabilities = topk_log_probabilities.reshape(-1)

        # Check if any of the beam searches has ended during this
        # growth step. Also if top beam (most probable) has ended
        # for one element of the batch.
        is_finished = topk_token_ids.eq(self.end_token_id)
        self.enforce_max_length(is_finished)
        is_top_beam_finished = is_finished[:, 0].clone()

        # Save the finished searches
        if is_finished.any():
            width = self.growing_beam.size(1)
            predictions = self.growing_beam.view(-1, self.beam_size, width)
            for i in range(is_finished.size(0)):
                top_done = bool(is_top_beam_finished[i])
                if top_done:
                    is_finished[i].fill_(1)

                # Store finished hypotheses for this batch.
                b = int(self.batch_offset[i])
                for j in is_finished[i].nonzero().view(-1).tolist():
                    self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))

                # If the batch reached the end, save the best hypotheses
                # in terms of length-penalized score.
                if top_done:
                    best_score, best_prediction = max(
                        self.hypotheses[b], key=lambda hyp: float(hyp[0])
                    )
                    self.results["scores"][b].append(best_score)
                    self.results["predictions"][b].append(best_prediction)

            non_finished = is_top_beam_finished.eq(0).nonzero().view(-1)
            if len(non_finished) == 0:
                self.is_done = True

            # Remove finished batches for the next step.
            self.topk_log_probabilities = topk_log_probabilities.index_select(
                0, non_finished
            ).reshape(-1)
            self.batch_offset = self.batch_offset.index_select(0, non_finished)
            self.growing_beam = predictions.index_select(0, non_finished).view(
                len(non_finished) * self.beam_size, width
            )
            # `non_finished` indexes batch elements, not flattened beams.
            surviving_beams_rows = (
                surviving_beams_rows.view(-1, self.beam_size)
                .index_select(0, non_finished)
                .view(-1)
            )

        return surviving_beams_rows

    def forward(self, encoder_input_ids, **kwargs):
        """ Runs the encoder once, then grows the beams step by step.

        Returns:
            dict with keys "predictions" and "scores"; each maps to one list
            per batch element.
        """
        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
        # that apply to the model as whole.
        # We let the specific kwargs override the common ones in case of conflict.
        kwargs_encoder = {
            argument[len("encoder_"):]: value
            for argument, value in kwargs.items()
            if argument.startswith("encoder_")
        }
        kwargs_decoder = {
            argument[len("decoder_"):]: value
            for argument, value in kwargs.items()
            if argument.startswith("decoder_")
        }
        kwargs_common = {
            argument: value
            for argument, value in kwargs.items()
            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
        }
        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)

        # Start from a clean state so that the module can be called repeatedly.
        self._reset_state()
        self.growing_beam = self.growing_beam.to(encoder_input_ids.device)

        # forward pass on the encoder
        encoder_outputs = self.model.encoder(encoder_input_ids, **kwargs_encoder)
        # NOTE(review): `tile` needs a tensor. If the encoder returns a tuple
        # we assume the hidden states come first -- confirm against the model.
        if isinstance(encoder_outputs, (tuple, list)):
            encoder_hidden_states = encoder_outputs[0]
        else:
            encoder_hidden_states = encoder_outputs
        kwargs_decoder["encoder_hidden_states"] = tile(
            encoder_hidden_states, self.beam_size, dim=0
        )

        # grow the beam by generating sequences in an autoregressive way
        for _ in range(self.max_length):
            # NOTE(review): only the last token of each beam is fed to the
            # decoder and `outputs[1]` is assumed to hold next-token scores
            # with one row per beam -- confirm against the decoder used.
            decoder_input = self.growing_beam[:, -1]
            outputs = self.model.decoder(decoder_input, **kwargs_decoder)
            log_probabilities = torch.nn.functional.log_softmax(outputs[1], dim=-1)
            surviving_beams_rows = self.step(log_probabilities)
            if self.is_done:
                break

            # NOTE(review): other per-row decoder kwargs (e.g. masks) are not
            # re-indexed here -- verify whether any are passed by callers.
            kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
                "encoder_hidden_states"
            ].index_select(0, surviving_beams_rows)

        return self.results

    def remove_repeating_trigrams(self, log_probabilities, _B):
        """ Discards (in place) the beams whose last trigram is a repetition. """
        if self._step + 1 <= 3:
            return
        for row in range(_B * self.beam_size):
            tokens = self.growing_beam[row].tolist()
            trigrams = [tuple(tokens[k:k + 3]) for k in range(len(tokens) - 2)]
            if trigrams and trigrams[-1] in trigrams[:-1]:
                log_probabilities[row] = -1e20

    def enforce_min_length(self, log_probabilities):
        """ Blocks (in place) the end token while `_step < min_length`. """
        if self._step < self.min_length:
            log_probabilities[:, self.end_token_id] = -1e20

    def enforce_max_length(self, is_finished=None):
        """ Marks (in place) every beam as finished once the length limit is hit. """
        if is_finished is not None and self._step + 1 >= self.max_length:
            is_finished.fill_(1)

    def length_penalty(self):
        """ Returns ((5 + (step + 1)) / 6) ** alpha for the current step. """
        return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha
|
|
||||||
|
|
||||||
|
|
||||||
def tile(x, count, dim=0):
    """
    Tiles `x` along dimension `dim` `count` times.

    Every slice along `dim` is repeated `count` times in a row, so the
    copies of a given slice stay adjacent in the output.

    Params:
        x: tensor to tile.
        count: number of consecutive copies of each slice.
        dim: dimension along which the slices are repeated.

    Returns:
        A new tensor whose size along `dim` is `count` times that of `x`.

    Example:
        >> ex = torch.tensor([[1, 2], [3, 4]])
        >> tile(ex, 2, 0)
        tensor([[1, 2], [1, 2], [3, 4], [3, 4]])
    """
    # `repeat_interleave` repeats each slice in place, which is exactly the
    # layout the beam search expects, and it accepts non-contiguous inputs.
    return x.repeat_interleave(count, dim=dim)
|
|
||||||
@@ -138,7 +138,11 @@ def swish(x):
|
|||||||
return x * torch.sigmoid(x)
|
return x * torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
|
def mish(x):
    """ Computes ``x * tanh(softplus(x))`` element-wise. """
    softplus_x = nn.functional.softplus(x)
    return x * torch.tanh(softplus_x)
|
||||||
|
|
||||||
|
|
||||||
|
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish}
|
||||||
|
|
||||||
|
|
||||||
BertLayerNorm = torch.nn.LayerNorm
|
BertLayerNorm = torch.nn.LayerNorm
|
||||||
@@ -158,19 +162,26 @@ class BertEmbeddings(nn.Module):
|
|||||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
|
||||||
seq_length = input_ids.size(1)
|
if input_ids is not None:
|
||||||
if position_ids is None:
|
input_shape = input_ids.size()
|
||||||
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
|
else:
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
input_shape = inputs_embeds.size()[:-1]
|
||||||
if token_type_ids is None:
|
|
||||||
token_type_ids = torch.zeros_like(input_ids)
|
|
||||||
|
|
||||||
words_embeddings = self.word_embeddings(input_ids)
|
seq_length = input_shape[1]
|
||||||
|
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||||
|
if position_ids is None:
|
||||||
|
position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
|
||||||
|
position_ids = position_ids.unsqueeze(0).expand(input_shape)
|
||||||
|
if token_type_ids is None:
|
||||||
|
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
||||||
|
|
||||||
|
if inputs_embeds is None:
|
||||||
|
inputs_embeds = self.word_embeddings(input_ids)
|
||||||
position_embeddings = self.position_embeddings(position_ids)
|
position_embeddings = self.position_embeddings(position_ids)
|
||||||
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
||||||
|
|
||||||
embeddings = words_embeddings + position_embeddings + token_type_embeddings
|
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
|
||||||
embeddings = self.LayerNorm(embeddings)
|
embeddings = self.LayerNorm(embeddings)
|
||||||
embeddings = self.dropout(embeddings)
|
embeddings = self.dropout(embeddings)
|
||||||
return embeddings
|
return embeddings
|
||||||
@@ -271,7 +282,7 @@ class BertAttention(nn.Module):
|
|||||||
if len(heads) == 0:
|
if len(heads) == 0:
|
||||||
return
|
return
|
||||||
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
|
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
|
||||||
heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads
|
heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
# Compute how many pruned heads are before the head and move the index accordingly
|
# Compute how many pruned heads are before the head and move the index accordingly
|
||||||
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||||
@@ -550,6 +561,10 @@ BERT_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
**encoder_hidden_states**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``:
|
**encoder_hidden_states**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model
|
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model
|
||||||
is configured as a decoder.
|
is configured as a decoder.
|
||||||
@@ -586,7 +601,7 @@ class BertModel(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertModel.from_pretrained('bert-base-uncased')
|
model = BertModel.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids)
|
||||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||||
|
|
||||||
@@ -615,8 +630,8 @@ class BertModel(BertPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
|
||||||
head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
|
head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None):
|
||||||
""" Forward pass on the Model.
|
""" Forward pass on the Model.
|
||||||
|
|
||||||
The model can behave as an encoder (with only self-attention) as well
|
The model can behave as an encoder (with only self-attention) as well
|
||||||
@@ -632,29 +647,40 @@ class BertModel(BertPreTrainedModel):
|
|||||||
https://arxiv.org/abs/1706.03762
|
https://arxiv.org/abs/1706.03762
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = inputs_embeds.size()[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
|
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||||
|
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = torch.ones_like(input_ids)
|
attention_mask = torch.ones(input_shape, device=device)
|
||||||
if encoder_attention_mask is None:
|
|
||||||
encoder_attention_mask = torch.ones_like(input_ids)
|
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
token_type_ids = torch.zeros_like(input_ids)
|
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
||||||
|
|
||||||
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
||||||
# ourselves in which case we just need to make it broadcastable to all heads.
|
# ourselves in which case we just need to make it broadcastable to all heads.
|
||||||
if attention_mask.dim() == 3:
|
if attention_mask.dim() == 3:
|
||||||
extended_attention_mask = attention_mask[:, None, :, :]
|
extended_attention_mask = attention_mask[:, None, :, :]
|
||||||
|
elif attention_mask.dim() == 2:
|
||||||
# Provided a padding mask of dimensions [batch_size, seq_length]
|
# Provided a padding mask of dimensions [batch_size, seq_length]
|
||||||
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
||||||
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||||
if attention_mask.dim() == 2:
|
|
||||||
if self.config.is_decoder:
|
if self.config.is_decoder:
|
||||||
batch_size, seq_length = input_ids.size()
|
batch_size, seq_length = input_shape
|
||||||
seq_ids = torch.arange(seq_length, device=input_ids.device)
|
seq_ids = torch.arange(seq_length, device=device)
|
||||||
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
|
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
|
||||||
|
causal_mask = causal_mask.to(torch.long) # not converting to long will cause errors with pytorch version < 1.3
|
||||||
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
||||||
else:
|
else:
|
||||||
extended_attention_mask = attention_mask[:, None, None, :]
|
extended_attention_mask = attention_mask[:, None, None, :]
|
||||||
|
else:
|
||||||
|
raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
|
||||||
|
|
||||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||||
# masked positions, this operation will create a tensor which is 0.0 for
|
# masked positions, this operation will create a tensor which is 0.0 for
|
||||||
@@ -666,13 +692,24 @@ class BertModel(BertPreTrainedModel):
|
|||||||
|
|
||||||
# If a 2D or 3D attention mask is provided for the cross-attention
|
# If a 2D or 3D attention mask is provided for the cross-attention
|
||||||
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||||
if encoder_attention_mask.dim() == 3:
|
if self.config.is_decoder and encoder_hidden_states is not None:
|
||||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
|
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
||||||
if encoder_attention_mask.dim() == 2:
|
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
||||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
|
if encoder_attention_mask is None:
|
||||||
|
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
|
||||||
|
|
||||||
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
if encoder_attention_mask.dim() == 3:
|
||||||
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
|
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
|
||||||
|
elif encoder_attention_mask.dim() == 2:
|
||||||
|
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
|
||||||
|
else:
|
||||||
|
raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape,
|
||||||
|
encoder_attention_mask.shape))
|
||||||
|
|
||||||
|
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||||
|
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
|
||||||
|
else:
|
||||||
|
encoder_extended_attention_mask = None
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
# 1.0 in head_mask indicate we keep the head
|
# 1.0 in head_mask indicate we keep the head
|
||||||
@@ -689,7 +726,7 @@ class BertModel(BertPreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
head_mask = [None] * self.config.num_hidden_layers
|
head_mask = [None] * self.config.num_hidden_layers
|
||||||
|
|
||||||
embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
|
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
|
||||||
encoder_outputs = self.encoder(embedding_output,
|
encoder_outputs = self.encoder(embedding_output,
|
||||||
attention_mask=extended_attention_mask,
|
attention_mask=extended_attention_mask,
|
||||||
head_mask=head_mask,
|
head_mask=head_mask,
|
||||||
@@ -738,7 +775,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForPreTraining.from_pretrained('bert-base-uncased')
|
model = BertForPreTraining.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids)
|
||||||
prediction_scores, seq_relationship_scores = outputs[:2]
|
prediction_scores, seq_relationship_scores = outputs[:2]
|
||||||
|
|
||||||
@@ -754,14 +791,15 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.cls.predictions.decoder
|
return self.cls.predictions.decoder
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
masked_lm_labels=None, next_sentence_label=None):
|
masked_lm_labels=None, next_sentence_label=None):
|
||||||
|
|
||||||
outputs = self.bert(input_ids,
|
outputs = self.bert(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
sequence_output, pooled_output = outputs[:2]
|
sequence_output, pooled_output = outputs[:2]
|
||||||
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
|
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
|
||||||
@@ -813,7 +851,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
|
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||||
loss, prediction_scores = outputs[:2]
|
loss, prediction_scores = outputs[:2]
|
||||||
|
|
||||||
@@ -829,7 +867,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.cls.predictions.decoder
|
return self.cls.predictions.decoder
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
|
masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
|
||||||
|
|
||||||
outputs = self.bert(input_ids,
|
outputs = self.bert(input_ids,
|
||||||
@@ -837,6 +875,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask,
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds,
|
||||||
encoder_hidden_states=encoder_hidden_states,
|
encoder_hidden_states=encoder_hidden_states,
|
||||||
encoder_attention_mask=encoder_attention_mask)
|
encoder_attention_mask=encoder_attention_mask)
|
||||||
|
|
||||||
@@ -895,7 +934,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
|
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids)
|
outputs = model(input_ids)
|
||||||
seq_relationship_scores = outputs[0]
|
seq_relationship_scores = outputs[0]
|
||||||
|
|
||||||
@@ -908,14 +947,15 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
next_sentence_label=None):
|
next_sentence_label=None):
|
||||||
|
|
||||||
outputs = self.bert(input_ids,
|
outputs = self.bert(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
@@ -959,7 +999,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss, logits = outputs[:2]
|
loss, logits = outputs[:2]
|
||||||
@@ -975,14 +1015,15 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None, labels=None):
|
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||||
|
|
||||||
outputs = self.bert(input_ids,
|
outputs = self.bert(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
@@ -1034,7 +1075,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
|
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
|
||||||
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
|
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
|
||||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||||
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
|
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss, classification_scores = outputs[:2]
|
loss, classification_scores = outputs[:2]
|
||||||
@@ -1049,8 +1090,8 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None, labels=None):
|
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||||
num_choices = input_ids.shape[1]
|
num_choices = input_ids.shape[1]
|
||||||
|
|
||||||
input_ids = input_ids.view(-1, input_ids.size(-1))
|
input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||||
@@ -1062,7 +1103,8 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
@@ -1107,7 +1149,7 @@ class BertForTokenClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
|
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, labels=labels)
|
outputs = model(input_ids, labels=labels)
|
||||||
loss, scores = outputs[:2]
|
loss, scores = outputs[:2]
|
||||||
@@ -1123,14 +1165,15 @@ class BertForTokenClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None, labels=None):
|
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||||
|
|
||||||
outputs = self.bert(input_ids,
|
outputs = self.bert(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
@@ -1207,14 +1250,15 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
start_positions=None, end_positions=None):
|
start_positions=None, end_positions=None):
|
||||||
|
|
||||||
outputs = self.bert(input_ids,
|
outputs = self.bert(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
|
|||||||
293
transformers/modeling_camembert.py
Normal file
293
transformers/modeling_camembert.py
Normal file
@@ -0,0 +1,293 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""PyTorch CamemBERT model. """
|
||||||
|
|
||||||
|
from __future__ import (absolute_import, division, print_function,
|
||||||
|
unicode_literals)
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
|
||||||
|
from .configuration_camembert import CamembertConfig
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||||
|
'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
CAMEMBERT_START_DOCSTRING = r""" The CamemBERT model was proposed in
|
||||||
|
`CamemBERT: a Tasty French Language Model`_
|
||||||
|
by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019.
|
||||||
|
|
||||||
|
It is a model trained on 138GB of French text.
|
||||||
|
|
||||||
|
This implementation is the same as RoBERTa.
|
||||||
|
|
||||||
|
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
|
||||||
|
refer to the PyTorch documentation for all matters related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`CamemBERT: a Tasty French Language Model`:
|
||||||
|
https://arxiv.org/abs/1911.03894
|
||||||
|
|
||||||
|
.. _`torch.nn.Module`:
|
||||||
|
https://pytorch.org/docs/stable/nn.html#module
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.CamembertConfig`): Model configuration class with all the parameters of the
|
||||||
|
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
CAMEMBERT_INPUTS_DOCSTRING = r"""
|
||||||
|
Inputs:
|
||||||
|
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
To match pre-training, CamemBERT input sequence should be formatted with <s> and </s> tokens as follows:
|
||||||
|
|
||||||
|
(a) For sequence pairs:
|
||||||
|
|
||||||
|
``tokens: <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
|
||||||
|
|
||||||
|
(b) For single sequences:
|
||||||
|
|
||||||
|
``tokens: <s> the dog is hairy . </s>``
|
||||||
|
|
||||||
|
Fully encoded sequences or sequence pairs can be obtained using the CamembertTokenizer.encode function with
|
||||||
|
the ``add_special_tokens`` parameter set to ``True``.
|
||||||
|
|
||||||
|
CamemBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
|
the right rather than the left.
|
||||||
|
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Optional segment token indices to indicate first and second portions of the inputs.
|
||||||
|
This embedding matrix is not trained (it is not pretrained during CamemBERT pretraining); you will have to train it
|
||||||
|
during finetuning.
|
||||||
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
|
corresponds to a `sentence B` token
|
||||||
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence token in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
|
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
|
||||||
|
class CamembertModel(RobertaModel):
|
||||||
|
r"""
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
|
Sequence of hidden-states at the output of the last layer of the model.
|
||||||
|
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
|
||||||
|
Last layer hidden-state of the first token of the sequence (classification token)
|
||||||
|
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||||
|
layer weights are trained from the next sentence prediction (classification)
|
||||||
|
To match pre-training, CamemBERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||||
|
|
||||||
|
(a) For sequence pairs:
|
||||||
|
|
||||||
|
``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]``
|
||||||
|
|
||||||
|
``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||||
|
|
||||||
|
(b) For single sequences:
|
||||||
|
|
||||||
|
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||||
|
|
||||||
|
``token_type_ids: 0 0 0 0 0 0 0``
|
||||||
|
|
||||||
|
objective during Bert pretraining. This output is usually *not* a good summary
|
||||||
|
of the semantic content of the input, you're often better with averaging or pooling
|
||||||
|
the sequence of hidden-states for the whole input sequence.
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
|
||||||
|
model = CamembertModel.from_pretrained('camembert-base')
|
||||||
|
input_ids = torch.tensor(tokenizer.encode("J'aime le camembert !")).unsqueeze(0) # Batch size 1
|
||||||
|
outputs = model(input_ids)
|
||||||
|
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||||
|
|
||||||
|
"""
|
||||||
|
config_class = CamembertConfig
|
||||||
|
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""CamemBERT Model with a `language modeling` head on top. """,
|
||||||
|
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
|
||||||
|
class CamembertForMaskedLM(RobertaForMaskedLM):
|
||||||
|
r"""
|
||||||
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Labels for computing the masked language modeling loss.
|
||||||
|
Indices should be in ``[-1, 0, ..., config.vocab_size - 1]`` (see ``input_ids`` docstring)
|
||||||
|
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
|
in ``[0, ..., config.vocab_size - 1]``
|
||||||
|
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||||
|
Masked language modeling loss.
|
||||||
|
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
|
||||||
|
model = CamembertForMaskedLM.from_pretrained('camembert-base')
|
||||||
|
input_ids = torch.tensor(tokenizer.encode("J'aime le camembert !")).unsqueeze(0) # Batch size 1
|
||||||
|
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||||
|
loss, prediction_scores = outputs[:2]
|
||||||
|
|
||||||
|
"""
|
||||||
|
config_class = CamembertConfig
|
||||||
|
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
|
||||||
|
on top of the pooled output) e.g. for GLUE tasks. """,
|
||||||
|
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
|
||||||
|
class CamembertForSequenceClassification(RobertaForSequenceClassification):
|
||||||
|
r"""
|
||||||
|
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||||
|
Labels for computing the sequence classification/regression loss.
|
||||||
|
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||||
|
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
|
||||||
|
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
|
||||||
|
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||||
|
Classification (or regression if config.num_labels==1) loss.
|
||||||
|
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
|
||||||
|
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
|
||||||
|
model = CamembertForSequenceClassification.from_pretrained('camembert-base')
|
||||||
|
input_ids = torch.tensor(tokenizer.encode("J'aime le camembert !")).unsqueeze(0) # Batch size 1
|
||||||
|
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||||
|
outputs = model(input_ids, labels=labels)
|
||||||
|
loss, logits = outputs[:2]
|
||||||
|
|
||||||
|
"""
|
||||||
|
config_class = CamembertConfig
|
||||||
|
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
|
||||||
|
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
|
||||||
|
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
|
||||||
|
class CamembertForMultipleChoice(RobertaForMultipleChoice):
|
||||||
|
r"""
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||||
|
Classification loss.
|
||||||
|
**classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
|
||||||
|
of the input tensors. (see `input_ids` above).
|
||||||
|
Classification scores (before SoftMax).
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
|
||||||
|
model = CamembertForMultipleChoice.from_pretrained('camembert-base')
|
||||||
|
choices = ["J'aime le camembert !", "Je deteste le camembert !"]
|
||||||
|
input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||||
|
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
|
||||||
|
outputs = model(input_ids, labels=labels)
|
||||||
|
loss, classification_scores = outputs[:2]
|
||||||
|
|
||||||
|
"""
|
||||||
|
config_class = CamembertConfig
|
||||||
|
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""CamemBERT Model with a token classification head on top (a linear layer on top of
|
||||||
|
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
|
||||||
|
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
|
||||||
|
class CamembertForTokenClassification(RobertaForTokenClassification):
|
||||||
|
r"""
|
||||||
|
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Labels for computing the token classification loss.
|
||||||
|
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||||
|
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||||
|
Classification loss.
|
||||||
|
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
|
||||||
|
Classification scores (before SoftMax).
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
|
||||||
|
model = CamembertForTokenClassification.from_pretrained('camembert-base')
|
||||||
|
input_ids = torch.tensor(tokenizer.encode("J'aime le camembert !", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||||
|
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
||||||
|
outputs = model(input_ids, labels=labels)
|
||||||
|
loss, scores = outputs[:2]
|
||||||
|
|
||||||
|
"""
|
||||||
|
config_class = CamembertConfig
|
||||||
|
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
@@ -63,7 +63,8 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
|
|||||||
scaled_attention_logits = matmul_qk / np.sqrt(dk)
|
scaled_attention_logits = matmul_qk / np.sqrt(dk)
|
||||||
|
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
scaled_attention_logits += (mask * -1e4)
|
nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
|
||||||
|
scaled_attention_logits += (mask[ns-nd:ns, :ns] * -1e4)
|
||||||
|
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
# Apply the attention mask
|
# Apply the attention mask
|
||||||
@@ -220,7 +221,8 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
(see `past` output below). Can be used to speed up sequential decoding.
|
(see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
|
should not be passed as input ids as they have already been computed.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -236,6 +238,10 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
@@ -246,9 +252,10 @@ class CTRLModel(CTRLPreTrainedModel):
|
|||||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
Sequence of hidden-states at the last layer of the model.
|
Sequence of hidden-states at the last layer of the model.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
|
should not be passed as input ids as they have already been computed.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
@@ -302,17 +309,26 @@ class CTRLModel(CTRLPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
|
||||||
input_shape = input_ids.size()
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
input_ids = input_ids.view(-1, input_shape[-1])
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
input_ids = input_ids.view(-1, input_shape[-1])
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = inputs_embeds.size()[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
if past is None:
|
if past is None:
|
||||||
past_length = 0
|
past_length = 0
|
||||||
past = [None] * len(self.h)
|
past = [None] * len(self.h)
|
||||||
else:
|
else:
|
||||||
past_length = past[0][0].size(-2)
|
past_length = past[0][0].size(-2)
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
|
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
|
||||||
|
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
|
||||||
|
|
||||||
# Attention mask.
|
# Attention mask.
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
@@ -354,10 +370,11 @@ class CTRLModel(CTRLPreTrainedModel):
|
|||||||
token_type_embeds = 0
|
token_type_embeds = 0
|
||||||
position_ids = position_ids.view(-1, input_shape[-1])
|
position_ids = position_ids.view(-1, input_shape[-1])
|
||||||
|
|
||||||
inputs_embeds = self.w(input_ids)
|
if inputs_embeds is None:
|
||||||
|
inputs_embeds = self.w(input_ids)
|
||||||
# inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
|
# inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
|
||||||
seq_len = input_ids.shape[-1]
|
seq_len = input_shape[-1]
|
||||||
mask = torch.triu(torch.ones(seq_len, seq_len), 1).to(inputs_embeds.device)
|
mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(inputs_embeds.device)
|
||||||
|
|
||||||
inputs_embeds *= np.sqrt(self.d_model_size)
|
inputs_embeds *= np.sqrt(self.d_model_size)
|
||||||
|
|
||||||
@@ -421,9 +438,10 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
|||||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
|
should not be passed as input ids as they have already been computed.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
@@ -455,14 +473,15 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.lm_head
|
return self.lm_head
|
||||||
|
|
||||||
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
labels=None):
|
labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
past=past,
|
past=past,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ import numpy as np
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
from torch.nn import CrossEntropyLoss
|
||||||
|
|
||||||
from .modeling_utils import PreTrainedModel, prune_linear_layer
|
from .modeling_utils import PreTrainedModel, prune_linear_layer
|
||||||
from .configuration_distilbert import DistilBertConfig
|
from .configuration_distilbert import DistilBertConfig
|
||||||
@@ -41,7 +42,9 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
|
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
|
||||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
|
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
|
||||||
|
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
|
||||||
|
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -387,6 +390,10 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
|
||||||
@@ -436,9 +443,20 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
|||||||
self.transformer.layer[layer].attention.prune_heads(heads)
|
self.transformer.layer[layer].attention.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self,
|
def forward(self,
|
||||||
input_ids, attention_mask=None, head_mask=None):
|
input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
|
||||||
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = inputs_embeds.size()[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
|
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||||
|
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = torch.ones_like(input_ids) # (bs, seq_length)
|
attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length)
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
# 1.0 in head_mask indicate we keep the head
|
# 1.0 in head_mask indicate we keep the head
|
||||||
@@ -455,8 +473,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
head_mask = [None] * self.config.num_hidden_layers
|
head_mask = [None] * self.config.num_hidden_layers
|
||||||
|
|
||||||
embedding_output = self.embeddings(input_ids) # (bs, seq_length, dim)
|
if inputs_embeds is None:
|
||||||
tfmr_output = self.transformer(x=embedding_output,
|
inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim)
|
||||||
|
tfmr_output = self.transformer(x=inputs_embeds,
|
||||||
attn_mask=attention_mask,
|
attn_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_state = tfmr_output[0]
|
hidden_state = tfmr_output[0]
|
||||||
@@ -514,10 +533,11 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.vocab_projector
|
return self.vocab_projector
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None):
|
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None):
|
||||||
dlbrt_output = self.distilbert(input_ids=input_ids,
|
dlbrt_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
|
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
|
||||||
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
|
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
|
||||||
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
|
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
|
||||||
@@ -578,10 +598,11 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
|
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||||
distilbert_output = self.distilbert(input_ids=input_ids,
|
distilbert_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
|
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
|
||||||
pooled_output = hidden_state[:, 0] # (bs, dim)
|
pooled_output = hidden_state[:, 0] # (bs, dim)
|
||||||
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
|
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
|
||||||
@@ -652,10 +673,11 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None):
|
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
|
||||||
distilbert_output = self.distilbert(input_ids=input_ids,
|
distilbert_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
|
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
|
||||||
|
|
||||||
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
|
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
|
||||||
@@ -683,3 +705,75 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
|||||||
outputs = (total_loss,) + outputs
|
outputs = (total_loss,) + outputs
|
||||||
|
|
||||||
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
|
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
                      the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
                      DISTILBERT_START_DOCSTRING,
                      DISTILBERT_INPUTS_DOCSTRING)
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification loss.
        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
            Classification scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, scores = outputs[:2]

    """
    def __init__(self, config):
        super(DistilBertForTokenClassification, self).__init__(config)
        self.num_labels = config.num_labels

        # Encoder, followed by dropout and a per-token linear classifier.
        self.distilbert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, head_mask=None,
                inputs_embeds=None, labels=None):
        """Run the encoder and classify every token.

        Returns a tuple ``((loss), scores, (hidden_states), (attentions))``;
        ``loss`` is only present when ``labels`` is given. See the class
        docstring for the shape of each element.
        """
        outputs = self.distilbert(input_ids,
                                  attention_mask=attention_mask,
                                  head_mask=head_mask,
                                  inputs_embeds=inputs_embeds)

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        # DistilBertModel has no pooled output: element 0 is the last hidden
        # state and the optional hidden_states / attentions follow directly at
        # index 1 (the sequence-classification head pools from outputs[0]
        # itself). Slicing from 2 silently dropped hidden_states.
        outputs = (logits,) + outputs[1:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                # Positions whose mask value is 1 contribute to the loss;
                # everything else (padding) is filtered out before the
                # cross-entropy is computed.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ logger = logging.getLogger(__name__)
|
|||||||
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
|
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
|
||||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
|
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
|
||||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
|
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
|
||||||
|
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin",
|
||||||
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
|
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
|
||||||
|
|
||||||
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
||||||
@@ -297,7 +298,8 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
(see `past` output below). Can be used to speed up sequential decoding.
|
(see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
|
should not be passed as input ids as they have already been computed.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -313,6 +315,10 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
@@ -323,9 +329,10 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
Sequence of hidden-states at the last layer of the model.
|
Sequence of hidden-states at the last layer of the model.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
|
should not be passed as input ids as they have already been computed.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
@@ -370,9 +377,17 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
|
||||||
input_shape = input_ids.size()
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
input_ids = input_ids.view(-1, input_shape[-1])
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
input_ids = input_ids.view(-1, input_shape[-1])
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = inputs_embeds.size()[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
if token_type_ids is not None:
|
if token_type_ids is not None:
|
||||||
token_type_ids = token_type_ids.view(-1, input_shape[-1])
|
token_type_ids = token_type_ids.view(-1, input_shape[-1])
|
||||||
if position_ids is not None:
|
if position_ids is not None:
|
||||||
@@ -384,8 +399,9 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
past_length = past[0][0].size(-2)
|
past_length = past[0][0].size(-2)
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
|
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
|
||||||
|
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
|
||||||
|
|
||||||
# Attention mask.
|
# Attention mask.
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
@@ -419,7 +435,8 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
head_mask = [None] * self.config.n_layer
|
head_mask = [None] * self.config.n_layer
|
||||||
|
|
||||||
inputs_embeds = self.wte(input_ids)
|
if inputs_embeds is None:
|
||||||
|
inputs_embeds = self.wte(input_ids)
|
||||||
position_embeds = self.wpe(position_ids)
|
position_embeds = self.wpe(position_ids)
|
||||||
if token_type_ids is not None:
|
if token_type_ids is not None:
|
||||||
token_type_embeds = self.wte(token_type_ids)
|
token_type_embeds = self.wte(token_type_ids)
|
||||||
@@ -486,9 +503,10 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
|
should not be passed as input ids as they have already been computed.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
@@ -520,14 +538,15 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.lm_head
|
return self.lm_head
|
||||||
|
|
||||||
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
labels=None):
|
labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
past=past,
|
past=past,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
@@ -577,9 +596,10 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
**mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
|
**mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
|
||||||
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||||
|
should not be passed as input ids as they have already been computed.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
@@ -623,14 +643,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.lm_head
|
return self.lm_head
|
||||||
|
|
||||||
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
mc_token_ids=None, lm_labels=None, mc_labels=None):
|
mc_token_ids=None, lm_labels=None, mc_labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
past=past,
|
past=past,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
|
|||||||
@@ -50,8 +50,10 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
|
|||||||
|
|
||||||
logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
|
logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
|
||||||
|
|
||||||
names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
|
with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
|
||||||
shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
|
names = json.load(names_handle)
|
||||||
|
with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
|
||||||
|
shapes = json.load(shapes_handle)
|
||||||
offsets = np.cumsum([np.prod(shape) for shape in shapes])
|
offsets = np.cumsum([np.prod(shape) for shape in shapes])
|
||||||
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
|
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
|
||||||
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
|
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
|
||||||
@@ -322,6 +324,10 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
|
||||||
@@ -373,14 +379,22 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
|
||||||
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
input_ids = input_ids.view(-1, input_shape[-1])
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = inputs_embeds.size()[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
# This was used when we had a single embedding matrice from position and token embeddings
|
# Code is different from when we had a single embedding matrice from position and token embeddings
|
||||||
# start = self.config.vocab_size + self.config.n_special
|
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||||
# end = start + input_ids.size(-1)
|
position_ids = torch.arange(input_shape[-1], dtype=torch.long, device=device)
|
||||||
# position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
|
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
|
||||||
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
|
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
|
||||||
|
|
||||||
# Attention mask.
|
# Attention mask.
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
@@ -413,11 +427,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
head_mask = [None] * self.config.n_layer
|
head_mask = [None] * self.config.n_layer
|
||||||
|
|
||||||
input_shape = input_ids.size()
|
if inputs_embeds is None:
|
||||||
input_ids = input_ids.view(-1, input_ids.size(-1))
|
inputs_embeds = self.tokens_embed(input_ids)
|
||||||
position_ids = position_ids.view(-1, position_ids.size(-1))
|
|
||||||
|
|
||||||
inputs_embeds = self.tokens_embed(input_ids)
|
|
||||||
position_embeds = self.positions_embed(position_ids)
|
position_embeds = self.positions_embed(position_ids)
|
||||||
if token_type_ids is not None:
|
if token_type_ids is not None:
|
||||||
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
|
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
|
||||||
@@ -495,13 +506,14 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.lm_head
|
return self.lm_head
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
labels=None):
|
labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
|
|
||||||
@@ -587,13 +599,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.lm_head
|
return self.lm_head
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
mc_token_ids=None, lm_labels=None, mc_labels=None):
|
mc_token_ids=None, lm_labels=None, mc_labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
|
|||||||
@@ -35,6 +35,8 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
|
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
|
||||||
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
|
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
|
||||||
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
|
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
|
||||||
|
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
|
||||||
|
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
|
||||||
}
|
}
|
||||||
|
|
||||||
class RobertaEmbeddings(BertEmbeddings):
|
class RobertaEmbeddings(BertEmbeddings):
|
||||||
@@ -48,16 +50,24 @@ class RobertaEmbeddings(BertEmbeddings):
|
|||||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
|
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
|
||||||
padding_idx=self.padding_idx)
|
padding_idx=self.padding_idx)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
|
||||||
seq_length = input_ids.size(1)
|
if input_ids is not None:
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
else:
|
||||||
|
input_shape = inputs_embeds.size()[:-1]
|
||||||
|
|
||||||
|
seq_length = input_shape[1]
|
||||||
|
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||||
|
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
# Position numbers begin at padding_idx+1. Padding symbols are ignored.
|
# Position numbers begin at padding_idx+1. Padding symbols are ignored.
|
||||||
# cf. fairseq's `utils.make_positions`
|
# cf. fairseq's `utils.make_positions`
|
||||||
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
|
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=device)
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = position_ids.unsqueeze(0).expand(input_shape)
|
||||||
return super(RobertaEmbeddings, self).forward(input_ids,
|
return super(RobertaEmbeddings, self).forward(input_ids,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids)
|
position_ids=position_ids,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
|
|
||||||
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
||||||
@@ -126,6 +136,10 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
@@ -222,13 +236,14 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
|||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.lm_head.decoder
|
return self.lm_head.decoder
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
masked_lm_labels=None):
|
masked_lm_labels=None):
|
||||||
outputs = self.roberta(input_ids,
|
outputs = self.roberta(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
prediction_scores = self.lm_head(sequence_output)
|
prediction_scores = self.lm_head(sequence_output)
|
||||||
|
|
||||||
@@ -309,13 +324,14 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
|||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
self.classifier = RobertaClassificationHead(config)
|
self.classifier = RobertaClassificationHead(config)
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||||
labels=None):
|
labels=None):
|
||||||
outputs = self.roberta(input_ids,
|
outputs = self.roberta(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
logits = self.classifier(sequence_output)
|
logits = self.classifier(sequence_output)
|
||||||
|
|
||||||
@@ -372,6 +388,10 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||||
Labels for computing the multiple choice classification loss.
|
Labels for computing the multiple choice classification loss.
|
||||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||||
@@ -415,8 +435,8 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None,
|
||||||
position_ids=None, head_mask=None):
|
position_ids=None, head_mask=None, inputs_embeds=None):
|
||||||
num_choices = input_ids.shape[1]
|
num_choices = input_ids.shape[1]
|
||||||
|
|
||||||
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
|
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||||
@@ -487,14 +507,15 @@ class RobertaForTokenClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None, labels=None):
|
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||||
|
|
||||||
outputs = self.roberta(input_ids,
|
outputs = self.roberta(input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
position_ids=position_ids,
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask,
|
||||||
|
inputs_embeds=inputs_embeds)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
|
|||||||
794
transformers/modeling_tf_albert.py
Normal file
794
transformers/modeling_tf_albert.py
Normal file
@@ -0,0 +1,794 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" TF 2.0 ALBERT model. """
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .configuration_albert import AlbertConfig
|
||||||
|
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||||
|
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each ALBERT shortcut name (base/large/xlarge/xxlarge, v1 and v2) to the
# URL of its ``tf_model.h5`` file -- presumably the pretrained TF 2.0 weights.
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
    'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
    'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
    'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
    'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
    'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
    'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
    'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
}
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertEmbeddings(tf.keras.layers.Layer):
    """Sum of word, position and token-type embeddings for ALBERT.

    The summed embeddings go through layer normalisation and dropout. The
    word-embedding matrix is also reused (transposed) as an output projection
    when the layer is called with ``mode="linear"``.
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertEmbeddings, self).__init__(**kwargs)

        self.config = config
        self.position_embeddings = tf.keras.layers.Embedding(
            config.max_position_embeddings,
            config.embedding_size,
            embeddings_initializer=get_initializer(self.config.initializer_range),
            name='position_embeddings')
        self.token_type_embeddings = tf.keras.layers.Embedding(
            config.type_vocab_size,
            config.embedding_size,
            embeddings_initializer=get_initializer(self.config.initializer_range),
            name='token_type_embeddings')

        # The attribute keeps the TensorFlow variable name (not snake_case) so
        # that any TensorFlow checkpoint file can be loaded.
        self.LayerNorm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name='LayerNorm')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def build(self, input_shape):
        """Create the shared word-embedding weight, then finish the Keras build."""
        with tf.name_scope("word_embeddings"):
            # The initializer was picked arbitrarily and works well.
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.config.vocab_size, self.config.embedding_size],
                initializer=get_initializer(self.config.initializer_range))
        super(TFAlbertEmbeddings, self).build(input_shape)

    def call(self, inputs, mode="embedding", training=False):
        """Embed the inputs or project hidden states onto the vocabulary.

        Args:
            inputs: when ``mode == "embedding"``, a sequence of four items
                ``(input_ids, position_ids, token_type_ids, inputs_embeds)``;
                when ``mode == "linear"``, a tensor with shape
                [batch_size, length, embedding_size].
            mode: string, one of "embedding" and "linear".
            training: forwarded to the dropout layer in "embedding" mode.
        Returns:
            outputs: (1) If mode == "embedding", the embedding tensor with
                shape [batch_size, length, embedding_size]; (2) if
                mode == "linear", a tensor with shape
                [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.

        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, training=training)
        if mode == "linear":
            return self._linear(inputs)
        raise ValueError("mode {} is not valid.".format(mode))

    def _embedding(self, inputs, training=False):
        """Build the normalised, dropped-out embedding sum for ``inputs``.

        Missing ``position_ids`` default to ``0..seq_length-1`` and missing
        ``token_type_ids`` default to all zeros. ``inputs_embeds``, when given,
        replaces the word-embedding lookup of ``input_ids``.
        """
        input_ids, position_ids, token_type_ids, inputs_embeds = inputs

        # The (batch, length) shape comes from whichever input was provided.
        if input_ids is None:
            ids_shape = shape_list(inputs_embeds)[:-1]
        else:
            ids_shape = shape_list(input_ids)
        seq_length = ids_shape[1]

        if position_ids is None:
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
        if token_type_ids is None:
            token_type_ids = tf.fill(ids_shape, 0)
        if inputs_embeds is None:
            inputs_embeds = tf.gather(self.word_embeddings, input_ids)

        summed = (inputs_embeds
                  + self.position_embeddings(position_ids)
                  + self.token_type_embeddings(token_type_ids))
        normalised = self.LayerNorm(summed)
        return self.dropout(normalised, training=training)

    def _linear(self, inputs):
        """Compute vocabulary logits with the transposed word-embedding matrix.

        Args:
            inputs: A float32 tensor with shape [batch_size, length, embedding_size]
        Returns:
            float32 tensor with shape [batch_size, length, vocab_size].
        """
        dims = shape_list(inputs)
        batch_size, length = dims[0], dims[1]
        flat = tf.reshape(inputs, [-1, self.config.embedding_size])
        logits = tf.matmul(flat, self.word_embeddings, transpose_b=True)
        return tf.reshape(logits, [batch_size, length, self.config.vocab_size])
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertSelfAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product self-attention.

    ``call`` takes ``[hidden_states, attention_mask, head_mask]`` and returns
    a tuple ``(context_layer,)``, extended with ``attention_probs`` when
    ``config.output_attentions`` is set.
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertSelfAttention, self).__init__(**kwargs)
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.output_attentions = config.output_attentions

        self.num_attention_heads = config.num_attention_heads
        assert config.hidden_size % config.num_attention_heads == 0
        self.attention_head_size = int(
            config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Sublayers are created in query/key/value order; keep that order.
        self.query = tf.keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name='query')
        self.key = tf.keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name='key')
        self.value = tf.keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name='value')

        self.dropout = tf.keras.layers.Dropout(
            config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x, batch_size):
        """Split the last axis into heads and move the head axis to position 1.

        Reshapes ``x`` to (batch_size, -1, num_heads, head_size) and returns it
        transposed to (batch_size, num_heads, -1, head_size).
        """
        per_head = tf.reshape(
            x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs, training=False):
        """Run self-attention over ``hidden_states``.

        Args:
            inputs: ``[hidden_states, attention_mask, head_mask]``; the two
                masks may be ``None``.
            training: forwarded to the attention-probability dropout.
        Returns:
            ``(context_layer, attention_probs)`` if ``self.output_attentions``
            else ``(context_layer,)``.
        """
        hidden_states, attention_mask, head_mask = inputs
        batch_size = shape_list(hidden_states)[0]

        query_layer = self.transpose_for_scores(self.query(hidden_states), batch_size)
        key_layer = self.transpose_for_scores(self.key(hidden_states), batch_size)
        value_layer = self.transpose_for_scores(self.value(hidden_states), batch_size)

        # Raw scores are the query/key dot product:
        # (batch size, num_heads, seq_len_q, seq_len_k), scaled by sqrt(head size).
        key_depth = tf.cast(shape_list(key_layer)[-1], tf.float32)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        attention_scores = attention_scores / tf.math.sqrt(key_depth)

        if attention_mask is not None:
            # The mask is additive (precomputed for all layers in TFAlbertModel call() function).
            attention_scores = attention_scores + attention_mask

        # Turn scores into probabilities.
        attention_probs = tf.nn.softmax(attention_scores, axis=-1)

        # Dropout here removes entire tokens to attend to, which might seem
        # a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs, training=training)

        # Optionally mask heads.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = tf.matmul(attention_probs, value_layer)
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        # Merge the heads back: (batch_size, seq_len_q, all_head_size)
        context_layer = tf.reshape(
            context_layer, (batch_size, -1, self.all_head_size))

        if self.output_attentions:
            return (context_layer, attention_probs)
        return (context_layer,)
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertSelfOutput(tf.keras.layers.Layer):
    """Dense projection, dropout, then residual add and layer normalisation."""

    def __init__(self, config, **kwargs):
        super(TFAlbertSelfOutput, self).__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name='dense')
        self.LayerNorm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name='LayerNorm')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def call(self, inputs, training=False):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``.

        Args:
            inputs: ``[hidden_states, input_tensor]``; ``input_tensor`` is the
                residual added before normalisation.
            training: forwarded to the dropout layer.
        """
        hidden_states, input_tensor = inputs

        projected = self.dense(hidden_states)
        projected = self.dropout(projected, training=training)
        return self.LayerNorm(projected + input_tensor)
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertAttention(TFBertSelfAttention):
    """Self-attention followed by the output projection, residual and layer norm.

    ``self.query``, ``self.key``, ``self.value``, ``self.dropout``,
    ``self.transpose_for_scores``, ``self.all_head_size`` and
    ``self.output_attentions`` are not defined in this class -- presumably
    they are inherited from ``TFBertSelfAttention``.
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertAttention, self).__init__(config, **kwargs)

        self.hidden_size = config.hidden_size
        self.dense = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name='dense')
        self.LayerNorm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name='LayerNorm')
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(self, inputs, training=False):
        """Attend over ``input_tensor`` and apply the output block.

        Args:
            inputs: ``[input_tensor, attention_mask, head_mask]``; the two
                masks may be ``None``.
            training: forwarded to both dropout applications.
        Returns:
            ``(attention_output,)``, extended with ``attention_probs`` when
            ``self.output_attentions`` is set.
        """
        input_tensor, attention_mask, head_mask = inputs
        batch_size = shape_list(input_tensor)[0]

        query_layer = self.transpose_for_scores(self.query(input_tensor), batch_size)
        key_layer = self.transpose_for_scores(self.key(input_tensor), batch_size)
        value_layer = self.transpose_for_scores(self.value(input_tensor), batch_size)

        # Raw scores are the query/key dot product:
        # (batch size, num_heads, seq_len_q, seq_len_k), scaled by sqrt(head size).
        key_depth = tf.cast(shape_list(key_layer)[-1], tf.float32)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        attention_scores = attention_scores / tf.math.sqrt(key_depth)

        if attention_mask is not None:
            # The mask is additive (precomputed for all layers in TFBertModel call() function).
            attention_scores = attention_scores + attention_mask

        # Turn scores into probabilities.
        attention_probs = tf.nn.softmax(attention_scores, axis=-1)

        # Dropout here removes entire tokens to attend to, which might seem
        # a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs, training=training)

        # Optionally mask heads.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = tf.matmul(attention_probs, value_layer)
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        # Merge the heads back: (batch_size, seq_len_q, all_head_size)
        context_layer = tf.reshape(
            context_layer, (batch_size, -1, self.all_head_size))

        # Output block: projection, dropout, residual add, layer norm.
        # The same ``self.dropout`` layer used on the probabilities is used here.
        projected = self.dense(context_layer)
        projected = self.dropout(projected, training=training)
        attention_output = self.LayerNorm(projected + input_tensor)

        # Attention probabilities are appended only when requested.
        if self.output_attentions:
            return (attention_output, attention_probs)
        return (attention_output,)
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertLayer(tf.keras.layers.Layer):
    """One ALBERT block: self-attention followed by a position-wise feed-forward network.

    The feed-forward output is added to the attention output (residual
    connection) and the sum is layer-normalized.
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertLayer, self).__init__(**kwargs)
        self.attention = TFAlbertAttention(config, name='attention')

        self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer(
            config.initializer_range), name='ffn')

        # `config.hidden_act` is either a key of ACT2FN or directly the activation callable.
        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act

        self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
            config.initializer_range), name='ffn_output')
        self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name='full_layer_layer_norm')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def call(self, inputs, training=False):
        """Run the block.

        Args:
            inputs: sequence ``(hidden_states, attention_mask, head_mask)``; all three
                are forwarded unchanged to the attention sub-layer.
            training: forwarded to the attention sub-layer and to dropout.

        Returns:
            Tuple whose first element is the normalized block output, followed by
            whatever extra elements the attention sub-layer returned after its first.
        """
        hidden_states, attention_mask, head_mask = inputs

        attention_outputs = self.attention(
            [hidden_states, attention_mask, head_mask], training=training)
        ffn_output = self.ffn(attention_outputs[0])
        ffn_output = self.activation(ffn_output)
        ffn_output = self.ffn_output(ffn_output)

        # Dropout must act on the feed-forward branch. The previous code applied it to the
        # block *input* and then immediately overwrote the result, so it never had any effect.
        ffn_output = self.dropout(ffn_output, training=training)
        hidden_states = self.full_layer_layer_norm(
            ffn_output + attention_outputs[0])

        # add attentions if we output them
        outputs = (hidden_states,) + attention_outputs[1:]
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertLayerGroup(tf.keras.layers.Layer):
    """A stack of ``config.inner_group_num`` ALBERT layers applied one after another."""

    def __init__(self, config, **kwargs):
        super(TFAlbertLayerGroup, self).__init__(**kwargs)

        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.albert_layers = [
            TFAlbertLayer(config, name="albert_layers_._{}".format(idx))
            for idx in range(config.inner_group_num)
        ]

    def call(self, inputs, training=False):
        """Feed the hidden states through every layer of the group.

        ``inputs`` is ``(hidden_states, attention_mask, head_mask)`` where
        ``head_mask`` is indexed by the position of the layer inside the group.
        Returns ``(hidden_states, [per-layer hidden states], [per-layer attentions])``;
        the optional entries are present only when the matching config flag is set.
        """
        hidden_states, attention_mask, head_mask = inputs

        collected_hidden_states = ()
        collected_attentions = ()

        for position, layer in enumerate(self.albert_layers):
            layer_result = layer(
                [hidden_states, attention_mask, head_mask[position]], training=training)
            hidden_states = layer_result[0]

            if self.output_attentions:
                collected_attentions += (layer_result[1],)

            if self.output_hidden_states:
                collected_hidden_states += (hidden_states,)

        # last-layer hidden state, (layer hidden states), (layer attentions)
        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs += (collected_hidden_states,)
        if self.output_attentions:
            outputs += (collected_attentions,)
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertTransformer(tf.keras.layers.Layer):
    """ALBERT encoder: projects the embeddings to the hidden size, then applies
    ``config.num_hidden_layers`` layer-group passes, re-using the
    ``config.num_hidden_groups`` layer groups (parameter sharing across layers).
    """

    def __init__(self, config, **kwargs):
        super(TFAlbertTransformer, self).__init__(**kwargs)

        self.config = config
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.embedding_hidden_mapping_in = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
            config.initializer_range), name='embedding_hidden_mapping_in')
        self.albert_layer_groups = [TFAlbertLayerGroup(
            config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups)]

    def call(self, inputs, training=False):
        """Encode the embeddings.

        Args:
            inputs: sequence ``(hidden_states, attention_mask, head_mask)``;
                ``head_mask`` is sliced per layer group before being passed down.
            training: forwarded to every layer group.

        Returns:
            ``(hidden_states, [all hidden states], [all attentions])``; the optional
            entries are present only when the matching config flag is set.
        """
        hidden_states, attention_mask, head_mask = inputs

        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
        all_attentions = ()

        if self.output_hidden_states:
            all_hidden_states = (hidden_states,)

        # Both values depend only on the config, so compute them once instead of
        # on every loop iteration. The arithmetic is unchanged.
        group_span = self.config.num_hidden_layers / self.config.num_hidden_groups
        # Number of layers in a hidden group
        layers_per_group = int(group_span)

        for i in range(self.config.num_hidden_layers):
            # Index of the hidden group
            group_idx = int(i / group_span)

            layer_group_output = self.albert_layer_groups[group_idx](
                [hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]], training=training)
            hidden_states = layer_group_output[0]

            if self.output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)

        # last-layer hidden state, (all hidden states), (all attentions)
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Prefix under which the base model is stored inside the task-specific models.
    base_model_prefix = "albert"
    # Configuration class and pretrained-weights archive map used for ALBERT.
    config_class = AlbertConfig
    pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
|
class TFAlbertMLMHead(tf.keras.layers.Layer):
    """Masked-language-modeling head: dense + activation + layer norm, followed by a
    projection through the (shared) input embeddings plus two vocabulary-sized biases.
    """

    def __init__(self, config, input_embeddings, **kwargs):
        super(TFAlbertMLMHead, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size

        dense_init = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.Dense(config.embedding_size,
                                           kernel_initializer=dense_init,
                                           name='dense')

        # `hidden_act` may be given by name (looked up in ACT2FN) or as a callable.
        hidden_act = config.hidden_act
        given_by_name = isinstance(hidden_act, str) or (
            sys.version_info[0] == 2 and isinstance(hidden_act, unicode))
        self.activation = ACT2FN[hidden_act] if given_by_name else hidden_act

        self.LayerNorm = tf.keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name='LayerNorm')

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = input_embeddings

    def build(self, input_shape):
        # Creation order is kept (bias first, then decoder/bias).
        bias_spec = dict(shape=(self.vocab_size,), initializer='zeros', trainable=True)
        self.bias = self.add_weight(name='bias', **bias_spec)
        self.decoder_bias = self.add_weight(name='decoder/bias', **bias_spec)
        super(TFAlbertMLMHead, self).build(input_shape)

    def call(self, hidden_states):
        """Turn hidden states into per-token vocabulary scores."""
        transformed = self.LayerNorm(self.activation(self.dense(hidden_states)))
        scores = self.decoder(transformed, mode="linear") + self.decoder_bias
        return scores + self.bias
|
||||||
|
|
||||||
|
|
||||||
|
ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in
|
||||||
|
`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
|
||||||
|
by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
|
||||||
|
two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT.
|
||||||
|
|
||||||
|
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
|
||||||
|
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
|
||||||
|
|
||||||
|
.. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
|
||||||
|
https://arxiv.org/abs/1909.11942
|
||||||
|
|
||||||
|
.. _`tf.keras.Model`:
|
||||||
|
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
|
||||||
|
|
||||||
|
Note on the model inputs:
|
||||||
|
TF 2.0 models accept two formats as inputs:
|
||||||
|
|
||||||
|
- having all inputs as keyword arguments (like PyTorch models), or
|
||||||
|
- having all inputs as a list, tuple or dict in the first positional arguments.
|
||||||
|
|
||||||
|
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
|
||||||
|
|
||||||
|
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
|
||||||
|
|
||||||
|
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
|
||||||
|
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
|
||||||
|
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
|
||||||
|
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
|
||||||
|
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
|
||||||
|
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||||
|
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
ALBERT_INPUTS_DOCSTRING = r"""
|
||||||
|
Inputs:
|
||||||
|
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of input sequence tokens in the vocabulary.
|
||||||
|
To match pre-training, ALBERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||||
|
|
||||||
|
(a) For sequence pairs:
|
||||||
|
|
||||||
|
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||||
|
|
||||||
|
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||||
|
|
||||||
|
(b) For single sequences:
|
||||||
|
|
||||||
|
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||||
|
|
||||||
|
``token_type_ids: 0 0 0 0 0 0 0``
|
||||||
|
|
||||||
|
Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
|
the right rather than the left.
|
||||||
|
|
||||||
|
Indices can be obtained using :class:`transformers.AlbertTokenizer`.
|
||||||
|
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||||
|
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
|
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Segment token indices to indicate first and second portions of the inputs.
|
||||||
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
|
corresponds to a `sentence B` token
|
||||||
|
(see `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_ for more details).
|
||||||
|
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
|
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@add_start_docstrings("The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertModel(TFAlbertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Albert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertModel

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertModel.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """

    def __init__(self, config, **kwargs):
        super(TFAlbertModel, self).__init__(config, **kwargs)
        self.num_hidden_layers = config.num_hidden_layers

        self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
        self.encoder = TFAlbertTransformer(config, name="encoder")
        self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
            config.initializer_range), activation='tanh', name='pooler')

    def get_input_embeddings(self):
        return self.embeddings

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        raise NotImplementedError

    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
        # `inputs` may be a list/tuple (positional order below), a dict keyed by
        # argument name, or a single tensor of input ids.
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        # We create a 4D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            # Head masking is not supported yet by this TF implementation.
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers

        embedding_output = self.embeddings(
            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
        encoder_outputs = self.encoder(
            [embedding_output, extended_attention_mask, head_mask], training=training)

        sequence_output = encoder_outputs[0]
        # Pool by passing the hidden state of the first token through the tanh dense layer.
        pooled_output = self.pooler(sequence_output[:, 0])

        # add hidden_states and attentions if they are here
        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
        # sequence_output, pooled_output, (hidden_states), (attentions)
        return outputs
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""Albert Model with a `language modeling` head on top. """,
                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForMaskedLM

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]

    """

    def __init__(self, config, *inputs, **kwargs):
        super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)

        # Base encoder; its embedding layer is handed to the LM head as decoder.
        self.albert = TFAlbertModel(config, name='albert')
        self.predictions = TFAlbertMLMHead(
            config, self.albert.embeddings, name='predictions')

    def get_output_embeddings(self):
        return self.albert.embeddings

    def call(self, inputs, **kwargs):
        albert_outputs = self.albert(inputs, **kwargs)

        is_training = kwargs.get('training', False)
        prediction_scores = self.predictions(albert_outputs[0], training=is_training)

        # Drop the pooled output (index 1); keep hidden states and attentions if present.
        # prediction_scores, (hidden_states), (attentions)
        return (prediction_scores,) + albert_outputs[2:]
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks. """,
                      ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import AlbertTokenizer, TFAlbertForSequenceClassification

        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.albert = TFAlbertModel(config, name='albert')
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        classifier_init = get_initializer(config.initializer_range)
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=classifier_init,
                                                name='classifier')

    def call(self, inputs, **kwargs):
        albert_outputs = self.albert(inputs, **kwargs)

        # Classify from the pooled output (index 1), with dropout applied in training mode.
        is_training = kwargs.get('training', False)
        pooled = self.dropout(albert_outputs[1], training=is_training)
        logits = self.classifier(pooled)

        # logits, (hidden_states), (attentions)
        return (logits,) + albert_outputs[2:]
|
||||||
@@ -112,6 +112,9 @@ class TFAutoModel(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -244,6 +247,9 @@ class TFAutoModelWithLMHead(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -369,6 +375,9 @@ class TFAutoModelForSequenceClassification(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
@@ -481,6 +490,9 @@ class TFAutoModelForQuestionAnswering(object):
|
|||||||
force_download: (`optional`) boolean, default False:
|
force_download: (`optional`) boolean, default False:
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||||
|
|
||||||
|
resume_download: (`optional`) boolean, default False:
|
||||||
|
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
proxies: (`optional`) dict, default None:
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
The proxies are used on each request.
|
The proxies are used on each request.
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import numpy as np
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
|
||||||
from .configuration_bert import BertConfig
|
from .configuration_bert import BertConfig
|
||||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
|
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||||
from .file_utils import add_start_docstrings
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -142,19 +142,25 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
def _embedding(self, inputs, training=False):
|
def _embedding(self, inputs, training=False):
|
||||||
"""Applies embedding based on inputs tensor."""
|
"""Applies embedding based on inputs tensor."""
|
||||||
input_ids, position_ids, token_type_ids = inputs
|
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
|
||||||
|
|
||||||
seq_length = tf.shape(input_ids)[1]
|
if input_ids is not None:
|
||||||
|
input_shape = shape_list(input_ids)
|
||||||
|
else:
|
||||||
|
input_shape = shape_list(inputs_embeds)[:-1]
|
||||||
|
|
||||||
|
seq_length = input_shape[1]
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
token_type_ids = tf.fill(tf.shape(input_ids), 0)
|
token_type_ids = tf.fill(input_shape, 0)
|
||||||
|
|
||||||
words_embeddings = tf.gather(self.word_embeddings, input_ids)
|
if inputs_embeds is None:
|
||||||
|
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
|
||||||
position_embeddings = self.position_embeddings(position_ids)
|
position_embeddings = self.position_embeddings(position_ids)
|
||||||
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
||||||
|
|
||||||
embeddings = words_embeddings + position_embeddings + token_type_embeddings
|
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
|
||||||
embeddings = self.LayerNorm(embeddings)
|
embeddings = self.LayerNorm(embeddings)
|
||||||
embeddings = self.dropout(embeddings, training=training)
|
embeddings = self.dropout(embeddings, training=training)
|
||||||
return embeddings
|
return embeddings
|
||||||
@@ -166,8 +172,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
|||||||
Returns:
|
Returns:
|
||||||
float32 tensor with shape [batch_size, length, vocab_size].
|
float32 tensor with shape [batch_size, length, vocab_size].
|
||||||
"""
|
"""
|
||||||
batch_size = tf.shape(inputs)[0]
|
batch_size = shape_list(inputs)[0]
|
||||||
length = tf.shape(inputs)[1]
|
length = shape_list(inputs)[1]
|
||||||
|
|
||||||
x = tf.reshape(inputs, [-1, self.hidden_size])
|
x = tf.reshape(inputs, [-1, self.hidden_size])
|
||||||
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
||||||
@@ -208,7 +214,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
|||||||
def call(self, inputs, training=False):
|
def call(self, inputs, training=False):
|
||||||
hidden_states, attention_mask, head_mask = inputs
|
hidden_states, attention_mask, head_mask = inputs
|
||||||
|
|
||||||
batch_size = tf.shape(hidden_states)[0]
|
batch_size = shape_list(hidden_states)[0]
|
||||||
mixed_query_layer = self.query(hidden_states)
|
mixed_query_layer = self.query(hidden_states)
|
||||||
mixed_key_layer = self.key(hidden_states)
|
mixed_key_layer = self.key(hidden_states)
|
||||||
mixed_value_layer = self.value(hidden_states)
|
mixed_value_layer = self.value(hidden_states)
|
||||||
@@ -219,7 +225,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||||
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k)
|
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k)
|
||||||
dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) # scale attention_scores
|
dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores
|
||||||
attention_scores = attention_scores / tf.math.sqrt(dk)
|
attention_scores = attention_scores / tf.math.sqrt(dk)
|
||||||
|
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
@@ -460,6 +466,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
|
|||||||
self.encoder = TFBertEncoder(config, name='encoder')
|
self.encoder = TFBertEncoder(config, name='encoder')
|
||||||
self.pooler = TFBertPooler(config, name='pooler')
|
self.pooler = TFBertPooler(config, name='pooler')
|
||||||
|
|
||||||
|
def get_input_embeddings(self):
|
||||||
|
return self.embeddings
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@@ -470,28 +479,39 @@ class TFBertMainLayer(tf.keras.layers.Layer):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
|
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
|
||||||
if isinstance(inputs, (tuple, list)):
|
if isinstance(inputs, (tuple, list)):
|
||||||
input_ids = inputs[0]
|
input_ids = inputs[0]
|
||||||
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
|
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
|
||||||
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
|
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
|
||||||
position_ids = inputs[3] if len(inputs) > 3 else position_ids
|
position_ids = inputs[3] if len(inputs) > 3 else position_ids
|
||||||
head_mask = inputs[4] if len(inputs) > 4 else head_mask
|
head_mask = inputs[4] if len(inputs) > 4 else head_mask
|
||||||
assert len(inputs) <= 5, "Too many inputs."
|
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
|
||||||
|
assert len(inputs) <= 6, "Too many inputs."
|
||||||
elif isinstance(inputs, dict):
|
elif isinstance(inputs, dict):
|
||||||
input_ids = inputs.get('input_ids')
|
input_ids = inputs.get('input_ids')
|
||||||
attention_mask = inputs.get('attention_mask', attention_mask)
|
attention_mask = inputs.get('attention_mask', attention_mask)
|
||||||
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
||||||
position_ids = inputs.get('position_ids', position_ids)
|
position_ids = inputs.get('position_ids', position_ids)
|
||||||
head_mask = inputs.get('head_mask', head_mask)
|
head_mask = inputs.get('head_mask', head_mask)
|
||||||
assert len(inputs) <= 5, "Too many inputs."
|
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
|
||||||
|
assert len(inputs) <= 6, "Too many inputs."
|
||||||
else:
|
else:
|
||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = shape_list(input_ids)
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = shape_list(inputs_embeds)[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = tf.fill(tf.shape(input_ids), 1)
|
attention_mask = tf.fill(input_shape, 1)
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
token_type_ids = tf.fill(tf.shape(input_ids), 0)
|
token_type_ids = tf.fill(input_shape, 0)
|
||||||
|
|
||||||
# We create a 3D attention mask from a 2D tensor mask.
|
# We create a 3D attention mask from a 2D tensor mask.
|
||||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
@@ -520,7 +540,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
|
|||||||
head_mask = [None] * self.num_hidden_layers
|
head_mask = [None] * self.num_hidden_layers
|
||||||
# head_mask = tf.constant([0] * self.num_hidden_layers)
|
# head_mask = tf.constant([0] * self.num_hidden_layers)
|
||||||
|
|
||||||
embedding_output = self.embeddings([input_ids, position_ids, token_type_ids], training=training)
|
embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
|
||||||
encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
|
encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
|
||||||
|
|
||||||
sequence_output = encoder_outputs[0]
|
sequence_output = encoder_outputs[0]
|
||||||
@@ -616,6 +636,10 @@ BERT_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
|
||||||
@@ -698,6 +722,9 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
|
|||||||
self.nsp = TFBertNSPHead(config, name='nsp___cls')
|
self.nsp = TFBertNSPHead(config, name='nsp___cls')
|
||||||
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
|
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
|
||||||
|
|
||||||
|
def get_output_embeddings(self):
|
||||||
|
return self.bert.embeddings
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
outputs = self.bert(inputs, **kwargs)
|
outputs = self.bert(inputs, **kwargs)
|
||||||
|
|
||||||
@@ -743,6 +770,9 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
|
|||||||
self.bert = TFBertMainLayer(config, name='bert')
|
self.bert = TFBertMainLayer(config, name='bert')
|
||||||
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
|
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
|
||||||
|
|
||||||
|
def get_output_embeddings(self):
|
||||||
|
return self.bert.embeddings
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
outputs = self.bert(inputs, **kwargs)
|
outputs = self.bert(inputs, **kwargs)
|
||||||
|
|
||||||
@@ -888,33 +918,39 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
|
|||||||
kernel_initializer=get_initializer(config.initializer_range),
|
kernel_initializer=get_initializer(config.initializer_range),
|
||||||
name='classifier')
|
name='classifier')
|
||||||
|
|
||||||
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
|
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
|
||||||
if isinstance(inputs, (tuple, list)):
|
if isinstance(inputs, (tuple, list)):
|
||||||
input_ids = inputs[0]
|
input_ids = inputs[0]
|
||||||
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
|
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
|
||||||
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
|
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
|
||||||
position_ids = inputs[3] if len(inputs) > 3 else position_ids
|
position_ids = inputs[3] if len(inputs) > 3 else position_ids
|
||||||
head_mask = inputs[4] if len(inputs) > 4 else head_mask
|
head_mask = inputs[4] if len(inputs) > 4 else head_mask
|
||||||
assert len(inputs) <= 5, "Too many inputs."
|
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
|
||||||
|
assert len(inputs) <= 6, "Too many inputs."
|
||||||
elif isinstance(inputs, dict):
|
elif isinstance(inputs, dict):
|
||||||
input_ids = inputs.get('input_ids')
|
input_ids = inputs.get('input_ids')
|
||||||
attention_mask = inputs.get('attention_mask', attention_mask)
|
attention_mask = inputs.get('attention_mask', attention_mask)
|
||||||
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
||||||
position_ids = inputs.get('position_ids', position_ids)
|
position_ids = inputs.get('position_ids', position_ids)
|
||||||
head_mask = inputs.get('head_mask', head_mask)
|
head_mask = inputs.get('head_mask', head_mask)
|
||||||
assert len(inputs) <= 5, "Too many inputs."
|
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
|
||||||
|
assert len(inputs) <= 6, "Too many inputs."
|
||||||
else:
|
else:
|
||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
num_choices = tf.shape(input_ids)[1]
|
if input_ids is not None:
|
||||||
seq_length = tf.shape(input_ids)[2]
|
num_choices = shape_list(input_ids)[1]
|
||||||
|
seq_length = shape_list(input_ids)[2]
|
||||||
|
else:
|
||||||
|
num_choices = shape_list(inputs_embeds)[1]
|
||||||
|
seq_length = shape_list(inputs_embeds)[2]
|
||||||
|
|
||||||
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
|
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
|
||||||
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
|
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
|
||||||
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
|
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
|
||||||
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
|
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
|
||||||
|
|
||||||
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
|
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
|
||||||
|
|
||||||
outputs = self.bert(flat_inputs, training=training)
|
outputs = self.bert(flat_inputs, training=training)
|
||||||
|
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
def call(self, inputs, training=False):
|
def call(self, inputs, training=False):
|
||||||
v, k, q, mask, layer_past, attention_mask, head_mask = inputs
|
v, k, q, mask, layer_past, attention_mask, head_mask = inputs
|
||||||
batch_size = q.shape[0]
|
batch_size = shape_list(q)[0]
|
||||||
|
|
||||||
q = self.Wq(q)
|
q = self.Wq(q)
|
||||||
k = self.Wk(k)
|
k = self.Wk(k)
|
||||||
@@ -192,6 +192,9 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
|
|||||||
name='h_._{}'.format(i)) for i in range(config.n_layer)]
|
name='h_._{}'.format(i)) for i in range(config.n_layer)]
|
||||||
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
|
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
|
||||||
|
|
||||||
|
def get_input_embeddings(self):
|
||||||
|
return self.w
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@@ -201,7 +204,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
|
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
|
||||||
if isinstance(inputs, (tuple, list)):
|
if isinstance(inputs, (tuple, list)):
|
||||||
input_ids = inputs[0]
|
input_ids = inputs[0]
|
||||||
past = inputs[1] if len(inputs) > 1 else past
|
past = inputs[1] if len(inputs) > 1 else past
|
||||||
@@ -209,7 +212,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
|
|||||||
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
|
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
|
||||||
position_ids = inputs[4] if len(inputs) > 4 else position_ids
|
position_ids = inputs[4] if len(inputs) > 4 else position_ids
|
||||||
head_mask = inputs[5] if len(inputs) > 5 else head_mask
|
head_mask = inputs[5] if len(inputs) > 5 else head_mask
|
||||||
assert len(inputs) <= 6, "Too many inputs."
|
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
|
||||||
|
assert len(inputs) <= 7, "Too many inputs."
|
||||||
elif isinstance(inputs, dict):
|
elif isinstance(inputs, dict):
|
||||||
input_ids = inputs.get('input_ids')
|
input_ids = inputs.get('input_ids')
|
||||||
past = inputs.get('past', past)
|
past = inputs.get('past', past)
|
||||||
@@ -217,12 +221,20 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
|
|||||||
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
||||||
position_ids = inputs.get('position_ids', position_ids)
|
position_ids = inputs.get('position_ids', position_ids)
|
||||||
head_mask = inputs.get('head_mask', head_mask)
|
head_mask = inputs.get('head_mask', head_mask)
|
||||||
assert len(inputs) <= 6, "Too many inputs."
|
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
|
||||||
|
assert len(inputs) <= 7, "Too many inputs."
|
||||||
else:
|
else:
|
||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
input_shape = shape_list(input_ids)
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = shape_list(input_ids)
|
||||||
|
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = shape_list(inputs_embeds)[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
if past is None:
|
if past is None:
|
||||||
past_length = 0
|
past_length = 0
|
||||||
@@ -230,8 +242,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
|
|||||||
else:
|
else:
|
||||||
past_length = shape_list(past[0][0])[-2]
|
past_length = shape_list(past[0][0])[-2]
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
|
position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
|
||||||
position_ids = tf.tile(position_ids, [shape_list(input_ids)[0], 1])
|
position_ids = tf.tile(position_ids, [input_shape[0], 1])
|
||||||
|
|
||||||
# Attention mask.
|
# Attention mask.
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
@@ -270,8 +282,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
|
|||||||
token_type_embeds = 0
|
token_type_embeds = 0
|
||||||
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
|
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
|
||||||
|
|
||||||
inputs_embeds = self.w(input_ids, mode='embedding')
|
if inputs_embeds is None:
|
||||||
# x = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
|
inputs_embeds = self.w(input_ids, mode='embedding')
|
||||||
seq_len = input_shape[-1]
|
seq_len = input_shape[-1]
|
||||||
mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
|
mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
|
||||||
|
|
||||||
@@ -374,6 +386,10 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
@@ -384,7 +400,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
|
|||||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
Sequence of hidden-states at the last layer of the model.
|
Sequence of hidden-states at the last layer of the model.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
@@ -446,7 +462,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
|
|||||||
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
@@ -476,6 +492,9 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
|
|||||||
|
|
||||||
self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
|
self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
|
||||||
|
|
||||||
|
def get_output_embeddings(self):
|
||||||
|
return self.lm_head.input_embeddings
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
transformer_outputs = self.transformer(inputs, **kwargs)
|
transformer_outputs = self.transformer(inputs, **kwargs)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|||||||
@@ -37,7 +37,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
|
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
|
||||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5"
|
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
|
||||||
|
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -96,7 +97,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
|||||||
initializer=get_initializer(self.initializer_range))
|
initializer=get_initializer(self.initializer_range))
|
||||||
super(TFEmbeddings, self).build(input_shape)
|
super(TFEmbeddings, self).build(input_shape)
|
||||||
|
|
||||||
def call(self, inputs, mode="embedding", training=False):
|
def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
|
||||||
"""Get token embeddings of inputs.
|
"""Get token embeddings of inputs.
|
||||||
Args:
|
Args:
|
||||||
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
|
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
|
||||||
@@ -112,13 +113,13 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
|||||||
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
|
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
|
||||||
"""
|
"""
|
||||||
if mode == "embedding":
|
if mode == "embedding":
|
||||||
return self._embedding(inputs, training=training)
|
return self._embedding(inputs, inputs_embeds=inputs_embeds, training=training)
|
||||||
elif mode == "linear":
|
elif mode == "linear":
|
||||||
return self._linear(inputs)
|
return self._linear(inputs)
|
||||||
else:
|
else:
|
||||||
raise ValueError("mode {} is not valid.".format(mode))
|
raise ValueError("mode {} is not valid.".format(mode))
|
||||||
|
|
||||||
def _embedding(self, inputs, training=False):
|
def _embedding(self, inputs, inputs_embeds=None, training=False):
|
||||||
"""
|
"""
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -136,14 +137,19 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
|||||||
else:
|
else:
|
||||||
input_ids, position_ids = inputs
|
input_ids, position_ids = inputs
|
||||||
|
|
||||||
seq_length = tf.shape(input_ids)[1]
|
if input_ids is not None:
|
||||||
|
seq_length = shape_list(input_ids)[1]
|
||||||
|
else:
|
||||||
|
seq_length = shape_list(inputs_embeds)[1]
|
||||||
|
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
||||||
|
|
||||||
word_embeddings = tf.gather(self.word_embeddings, input_ids)
|
if inputs_embeds is None:
|
||||||
|
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
|
||||||
position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim)
|
position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim)
|
||||||
|
|
||||||
embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim)
|
embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim)
|
||||||
embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim)
|
embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim)
|
||||||
embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim)
|
embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim)
|
||||||
return embeddings
|
return embeddings
|
||||||
@@ -155,8 +161,8 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
|||||||
Returns:
|
Returns:
|
||||||
float32 tensor with shape [batch_size, length, vocab_size].
|
float32 tensor with shape [batch_size, length, vocab_size].
|
||||||
"""
|
"""
|
||||||
batch_size = tf.shape(inputs)[0]
|
batch_size = shape_list(inputs)[0]
|
||||||
length = tf.shape(inputs)[1]
|
length = shape_list(inputs)[1]
|
||||||
|
|
||||||
x = tf.reshape(inputs, [-1, self.dim])
|
x = tf.reshape(inputs, [-1, self.dim])
|
||||||
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
||||||
@@ -398,28 +404,42 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
|
|||||||
self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings
|
self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings
|
||||||
self.transformer = TFTransformer(config, name="transformer") # Encoder
|
self.transformer = TFTransformer(config, name="transformer") # Encoder
|
||||||
|
|
||||||
|
def get_input_embeddings(self):
|
||||||
|
return self.embeddings
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def _prune_heads(self, heads_to_prune):
|
def _prune_heads(self, heads_to_prune):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def call(self, inputs, attention_mask=None, head_mask=None, training=False):
|
def call(self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, training=False):
|
||||||
if isinstance(inputs, (tuple, list)):
|
if isinstance(inputs, (tuple, list)):
|
||||||
input_ids = inputs[0]
|
input_ids = inputs[0]
|
||||||
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
|
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
|
||||||
head_mask = inputs[2] if len(inputs) > 2 else head_mask
|
head_mask = inputs[2] if len(inputs) > 2 else head_mask
|
||||||
assert len(inputs) <= 3, "Too many inputs."
|
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
|
||||||
|
assert len(inputs) <= 4, "Too many inputs."
|
||||||
elif isinstance(inputs, dict):
|
elif isinstance(inputs, dict):
|
||||||
input_ids = inputs.get('input_ids')
|
input_ids = inputs.get('input_ids')
|
||||||
attention_mask = inputs.get('attention_mask', attention_mask)
|
attention_mask = inputs.get('attention_mask', attention_mask)
|
||||||
head_mask = inputs.get('head_mask', head_mask)
|
head_mask = inputs.get('head_mask', head_mask)
|
||||||
assert len(inputs) <= 3, "Too many inputs."
|
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
|
||||||
|
assert len(inputs) <= 4, "Too many inputs."
|
||||||
else:
|
else:
|
||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = shape_list(input_ids)
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = shape_list(inputs_embeds)[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = tf.ones(shape_list(input_ids)) # (bs, seq_length)
|
attention_mask = tf.ones(input_shape) # (bs, seq_length)
|
||||||
attention_mask = tf.cast(attention_mask, dtype=tf.float32)
|
attention_mask = tf.cast(attention_mask, dtype=tf.float32)
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
@@ -432,7 +452,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
|
|||||||
else:
|
else:
|
||||||
head_mask = [None] * self.num_hidden_layers
|
head_mask = [None] * self.num_hidden_layers
|
||||||
|
|
||||||
embedding_output = self.embeddings(input_ids) # (bs, seq_length, dim)
|
embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim)
|
||||||
tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training)
|
tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training)
|
||||||
|
|
||||||
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
|
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
|
||||||
@@ -508,6 +528,10 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
|
||||||
@@ -609,6 +633,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
|
|||||||
self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
|
self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
|
||||||
self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
|
self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
|
||||||
|
|
||||||
|
def get_output_embeddings(self):
|
||||||
|
return self.vocab_projector.input_embeddings
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
distilbert_output = self.distilbert(inputs, **kwargs)
|
distilbert_output = self.distilbert(inputs, **kwargs)
|
||||||
|
|
||||||
@@ -677,6 +704,53 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
|
|||||||
return outputs # logits, (hidden_states), (attentions)
|
return outputs # logits, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
|
||||||
|
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
|
||||||
|
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
||||||
|
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
|
||||||
|
r"""
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
|
||||||
|
Classification scores (before SoftMax).
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
Examples::
|
||||||
|
import tensorflow as tf
|
||||||
|
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
|
||||||
|
tokenizer = DistilBertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
|
model = TFDistilBertForTokenClassification.from_pretrained('bert-base-uncased')
|
||||||
|
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||||
|
outputs = model(input_ids)
|
||||||
|
scores = outputs[0]
|
||||||
|
"""
|
||||||
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
|
super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
||||||
|
self.num_labels = config.num_labels
|
||||||
|
|
||||||
|
self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
|
||||||
|
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||||
|
self.classifier = tf.keras.layers.Dense(config.num_labels,
|
||||||
|
kernel_initializer=get_initializer(config.initializer_range),
|
||||||
|
name='classifier')
|
||||||
|
|
||||||
|
def call(self, inputs, **kwargs):
|
||||||
|
outputs = self.distilbert(inputs, **kwargs)
|
||||||
|
|
||||||
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
|
sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
|
||||||
|
logits = self.classifier(sequence_output)
|
||||||
|
|
||||||
|
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
|
||||||
|
|
||||||
|
return outputs # scores, (hidden_states), (attentions)
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
||||||
the hidden-states output to compute `span start logits` and `span end logits`). """,
|
the hidden-states output to compute `span start logits` and `span end logits`). """,
|
||||||
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ class TFAttention(tf.keras.layers.Layer):
|
|||||||
# q, k, v have shape [batch, heads, sequence, features]
|
# q, k, v have shape [batch, heads, sequence, features]
|
||||||
w = tf.matmul(q, k, transpose_b=True)
|
w = tf.matmul(q, k, transpose_b=True)
|
||||||
if self.scale:
|
if self.scale:
|
||||||
dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
|
dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores
|
||||||
w = w / tf.math.sqrt(dk)
|
w = w / tf.math.sqrt(dk)
|
||||||
|
|
||||||
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
|
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
|
||||||
@@ -219,6 +219,9 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
|
|||||||
name='h_._{}'.format(i)) for i in range(config.n_layer)]
|
name='h_._{}'.format(i)) for i in range(config.n_layer)]
|
||||||
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
|
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
|
||||||
|
|
||||||
|
def get_input_embeddings(self):
|
||||||
|
return self.wte
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@@ -228,7 +231,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
|
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
|
||||||
if isinstance(inputs, (tuple, list)):
|
if isinstance(inputs, (tuple, list)):
|
||||||
input_ids = inputs[0]
|
input_ids = inputs[0]
|
||||||
past = inputs[1] if len(inputs) > 1 else past
|
past = inputs[1] if len(inputs) > 1 else past
|
||||||
@@ -236,7 +239,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
|
|||||||
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
|
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
|
||||||
position_ids = inputs[4] if len(inputs) > 4 else position_ids
|
position_ids = inputs[4] if len(inputs) > 4 else position_ids
|
||||||
head_mask = inputs[5] if len(inputs) > 5 else head_mask
|
head_mask = inputs[5] if len(inputs) > 5 else head_mask
|
||||||
assert len(inputs) <= 6, "Too many inputs."
|
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
|
||||||
|
assert len(inputs) <= 7, "Too many inputs."
|
||||||
elif isinstance(inputs, dict):
|
elif isinstance(inputs, dict):
|
||||||
input_ids = inputs.get('input_ids')
|
input_ids = inputs.get('input_ids')
|
||||||
past = inputs.get('past', past)
|
past = inputs.get('past', past)
|
||||||
@@ -244,17 +248,28 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
|
|||||||
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
||||||
position_ids = inputs.get('position_ids', position_ids)
|
position_ids = inputs.get('position_ids', position_ids)
|
||||||
head_mask = inputs.get('head_mask', head_mask)
|
head_mask = inputs.get('head_mask', head_mask)
|
||||||
assert len(inputs) <= 6, "Too many inputs."
|
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
|
||||||
|
assert len(inputs) <= 7, "Too many inputs."
|
||||||
else:
|
else:
|
||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
|
if input_ids is not None and inputs_embeds is not None:
|
||||||
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||||
|
elif input_ids is not None:
|
||||||
|
input_shape = shape_list(input_ids)
|
||||||
|
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
|
||||||
|
elif inputs_embeds is not None:
|
||||||
|
input_shape = shape_list(inputs_embeds)[:-1]
|
||||||
|
else:
|
||||||
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
if past is None:
|
if past is None:
|
||||||
past_length = 0
|
past_length = 0
|
||||||
past = [None] * len(self.h)
|
past = [None] * len(self.h)
|
||||||
else:
|
else:
|
||||||
past_length = shape_list(past[0][0])[-2]
|
past_length = shape_list(past[0][0])[-2]
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
|
position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
|
||||||
|
|
||||||
if attention_mask is not None:
|
if attention_mask is not None:
|
||||||
# We create a 3D attention mask from a 2D tensor mask.
|
# We create a 3D attention mask from a 2D tensor mask.
|
||||||
@@ -286,11 +301,10 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
|
|||||||
head_mask = [None] * self.num_hidden_layers
|
head_mask = [None] * self.num_hidden_layers
|
||||||
# head_mask = tf.constant([0] * self.num_hidden_layers)
|
# head_mask = tf.constant([0] * self.num_hidden_layers)
|
||||||
|
|
||||||
input_shape = shape_list(input_ids)
|
|
||||||
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
|
|
||||||
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
|
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
|
||||||
|
|
||||||
inputs_embeds = self.wte(input_ids, mode='embedding')
|
if inputs_embeds is None:
|
||||||
|
inputs_embeds = self.wte(input_ids, mode='embedding')
|
||||||
position_embeds = self.wpe(position_ids)
|
position_embeds = self.wpe(position_ids)
|
||||||
if token_type_ids is not None:
|
if token_type_ids is not None:
|
||||||
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
|
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
|
||||||
@@ -408,6 +422,10 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
**inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||||
|
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||||
|
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||||
|
than the model's internal embedding lookup matrix.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
|
||||||
@@ -418,7 +436,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
|
|||||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
Sequence of hidden-states at the last layer of the model.
|
Sequence of hidden-states at the last layer of the model.
|
||||||
**past**:
|
**past**:
|
||||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
@@ -458,7 +476,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
|
|||||||
**prediction_scores**: `tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: `tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of `tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
@@ -486,6 +504,9 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
|
|||||||
super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
|
super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
|
||||||
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
||||||
|
|
||||||
|
def get_output_embeddings(self):
|
||||||
|
return self.transformer.wte
|
||||||
|
|
||||||
def call(self, inputs, **kwargs):
|
def call(self, inputs, **kwargs):
|
||||||
transformer_outputs = self.transformer(inputs, **kwargs)
|
transformer_outputs = self.transformer(inputs, **kwargs)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
@@ -514,7 +535,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
|||||||
**mc_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices)``
|
**mc_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices)``
|
||||||
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
||||||
**past**:
|
**past**:
|
||||||
list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
list of `tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||||
Can be used (see `past` input) to speed up sequential decoding.
|
Can be used (see `past` input) to speed up sequential decoding.
|
||||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
@@ -556,7 +577,10 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
|||||||
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
self.transformer = TFGPT2MainLayer(config, name='transformer')
|
||||||
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
|
||||||
|
|
||||||
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
|
def get_output_embeddings(self):
|
||||||
|
return self.transformer.wte
|
||||||
|
|
||||||
|
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
|
||||||
if isinstance(inputs, (tuple, list)):
|
if isinstance(inputs, (tuple, list)):
|
||||||
input_ids = inputs[0]
|
input_ids = inputs[0]
|
||||||
past = inputs[1] if len(inputs) > 1 else past
|
past = inputs[1] if len(inputs) > 1 else past
|
||||||
@@ -564,8 +588,9 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
|||||||
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
|
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
|
||||||
position_ids = inputs[4] if len(inputs) > 4 else position_ids
|
position_ids = inputs[4] if len(inputs) > 4 else position_ids
|
||||||
head_mask = inputs[5] if len(inputs) > 5 else head_mask
|
head_mask = inputs[5] if len(inputs) > 5 else head_mask
|
||||||
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
|
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
|
||||||
assert len(inputs) <= 7, "Too many inputs."
|
mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
|
||||||
|
assert len(inputs) <= 8, "Too many inputs."
|
||||||
elif isinstance(inputs, dict):
|
elif isinstance(inputs, dict):
|
||||||
input_ids = inputs.get('input_ids')
|
input_ids = inputs.get('input_ids')
|
||||||
past = inputs.get('past', past)
|
past = inputs.get('past', past)
|
||||||
@@ -573,21 +598,25 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
|||||||
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
||||||
position_ids = inputs.get('position_ids', position_ids)
|
position_ids = inputs.get('position_ids', position_ids)
|
||||||
head_mask = inputs.get('head_mask', head_mask)
|
head_mask = inputs.get('head_mask', head_mask)
|
||||||
|
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
|
||||||
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
|
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
|
||||||
assert len(inputs) <= 7, "Too many inputs."
|
assert len(inputs) <= 8, "Too many inputs."
|
||||||
else:
|
else:
|
||||||
input_ids = inputs
|
input_ids = inputs
|
||||||
|
|
||||||
input_shapes = shape_list(input_ids)
|
if input_ids is not None:
|
||||||
|
input_shapes = shape_list(input_ids)
|
||||||
|
else:
|
||||||
|
input_shapes = shape_list(inputs_embeds)[:-1]
|
||||||
|
|
||||||
seq_length = input_shapes[-1]
|
seq_length = input_shapes[-1]
|
||||||
|
|
||||||
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
|
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
|
||||||
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
|
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
|
||||||
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
|
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
|
||||||
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
|
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
|
||||||
|
|
||||||
flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
|
flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
|
||||||
|
|
||||||
transformer_outputs = self.transformer(flat_inputs, training=training)
|
transformer_outputs = self.transformer(flat_inputs, training=training)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user