Compare commits
234 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
89fd3450a6 | ||
|
|
9fd6e7ab9f | ||
|
|
a15562e170 | ||
|
|
0287d264e9 | ||
|
|
7f522437bc | ||
|
|
3fbf301bba | ||
|
|
2dcc5a1629 | ||
|
|
7b0c99add9 | ||
|
|
31d3373bc9 | ||
|
|
fede4ef45d | ||
|
|
b6cd856b08 | ||
|
|
ff7368eb6b | ||
|
|
6ae0bb5291 | ||
|
|
819b468f70 | ||
|
|
a1c34bd286 | ||
|
|
ea86bef545 | ||
|
|
e0f867a9ba | ||
|
|
11600edc6e | ||
|
|
b6992b7b47 | ||
|
|
bdb4409ed8 | ||
|
|
0c8e823b03 | ||
|
|
0cd283522a | ||
|
|
c85b5db61a | ||
|
|
5c2b94c82a | ||
|
|
87747518e9 | ||
|
|
719cb3738d | ||
|
|
fc1fbae45d | ||
|
|
42e00cf9e1 | ||
|
|
d7a4c3252e | ||
|
|
7f006cdd87 | ||
|
|
0fd0b674e6 | ||
|
|
b65a994f59 | ||
|
|
1d438f15b3 | ||
|
|
574c5b3a72 | ||
|
|
09363f2a8b | ||
|
|
51e980ce36 | ||
|
|
206c35e9a4 | ||
|
|
f3d18c71ec | ||
|
|
d483cd8e46 | ||
|
|
d2f21f08f5 | ||
|
|
12b9cc9e26 | ||
|
|
bfe93a5a21 | ||
|
|
256086bc69 | ||
|
|
80aa87d9a3 | ||
|
|
455a4c842c | ||
|
|
7a1f174a9d | ||
|
|
c665e0fcfe | ||
|
|
9b6e3b34d9 | ||
|
|
dec8f4d6fd | ||
|
|
bc29aa67a9 | ||
|
|
f35f612280 | ||
|
|
7ca9653852 | ||
|
|
25e8389439 | ||
|
|
dc43215c01 | ||
|
|
282c276e09 | ||
|
|
803c1cc4ea | ||
|
|
7044ed6b05 | ||
|
|
cd65c41a83 | ||
|
|
69da972ace | ||
|
|
88111de07c | ||
|
|
b66e9b4433 | ||
|
|
0a2fecdf90 | ||
|
|
3871b8a107 | ||
|
|
8678ff8df5 | ||
|
|
e0caab0cf0 | ||
|
|
a600b30cc3 | ||
|
|
20c06fa37d | ||
|
|
39eb31e11e | ||
|
|
350bb6bffa | ||
|
|
82462c5cba | ||
|
|
41f35d0b3d | ||
|
|
01ad55f8cf | ||
|
|
50e615f43d | ||
|
|
f8aace6bcd | ||
|
|
8faf2e086b | ||
|
|
f7978490b2 | ||
|
|
ce5ef4b35d | ||
|
|
5dd7b677ad | ||
|
|
ca1a00a302 | ||
|
|
4e6a3172ce | ||
|
|
fd10d79b55 | ||
|
|
abe734ca1f | ||
|
|
0f5a799456 | ||
|
|
d51f72d5de | ||
|
|
306af132d7 | ||
|
|
50e6daf83a | ||
|
|
0517e7a1cb | ||
|
|
2fb9a934b4 | ||
|
|
c8731b9583 | ||
|
|
6060b2f89b | ||
|
|
07e21307b6 | ||
|
|
caf1d116a6 | ||
|
|
e7fba4bef5 | ||
|
|
fe8fb10b44 | ||
|
|
bf3dc778b8 | ||
|
|
0a74c88ac6 | ||
|
|
5f297c7be3 | ||
|
|
d9847678b3 | ||
|
|
0f8ad89206 | ||
|
|
9ce42dc540 | ||
|
|
1d15a7f278 | ||
|
|
ed2ab1c220 | ||
|
|
0ecfd17f49 | ||
|
|
50792dbdcc | ||
|
|
e7706f514b | ||
|
|
b5eb283aaa | ||
|
|
f753d4e32b | ||
|
|
75bc2a03cc | ||
|
|
1dc43e56c9 | ||
|
|
912a377e90 | ||
|
|
c9bce1811c | ||
|
|
62df4ba59a | ||
|
|
4ce5f36f78 | ||
|
|
a5fe16687b | ||
|
|
497f73c964 | ||
|
|
93e82ab424 | ||
|
|
19b7c9b0b7 | ||
|
|
fea921d382 | ||
|
|
da1e4e53fc | ||
|
|
0d8f8848d5 | ||
|
|
7f2c384c80 | ||
|
|
4d16b279e5 | ||
|
|
c513415b19 | ||
|
|
778a263f09 | ||
|
|
74d78beeb4 | ||
|
|
7f5d85347e | ||
|
|
906581ae3c | ||
|
|
b247b0d880 | ||
|
|
780f183e55 | ||
|
|
e424d2e45d | ||
|
|
1ae81e4aa1 | ||
|
|
5d29f8e99b | ||
|
|
a8ad83040d | ||
|
|
ca4baf8ca1 | ||
|
|
60c984da6c | ||
|
|
42968138c8 | ||
|
|
1d23240068 | ||
|
|
d06c5a2a0a | ||
|
|
edc5222fc3 | ||
|
|
9cf298dfc1 | ||
|
|
0d288727b8 | ||
|
|
447afe9cdf | ||
|
|
a175a9dc01 | ||
|
|
53282b5bd0 | ||
|
|
26bda77225 | ||
|
|
c8933bb2d9 | ||
|
|
e08c01aa1a | ||
|
|
84a3a9689d | ||
|
|
f68339639a | ||
|
|
cb60ce59dd | ||
|
|
529a16dec6 | ||
|
|
f1b018740c | ||
|
|
e85123d398 | ||
|
|
06510ccb53 | ||
|
|
3bcbebd440 | ||
|
|
436ce07218 | ||
|
|
ab7bd5ef98 | ||
|
|
47d6853439 | ||
|
|
df9d6effae | ||
|
|
3f20dd7186 | ||
|
|
e13465fb8b | ||
|
|
c603d099aa | ||
|
|
2ba1a14fb0 | ||
|
|
90dcd8c05d | ||
|
|
57272d5ddf | ||
|
|
b006a7a12f | ||
|
|
14eef67eb2 | ||
|
|
296df2b18c | ||
|
|
55f69a11b6 | ||
|
|
47267ba556 | ||
|
|
034aa0c2d7 | ||
|
|
e00b4ff1de | ||
|
|
814a3f4e01 | ||
|
|
2f9397139d | ||
|
|
d6bbcbc4cf | ||
|
|
6f877d9daf | ||
|
|
07681b6b58 | ||
|
|
fdc487d8b3 | ||
|
|
aa05dc8935 | ||
|
|
e4515faf54 | ||
|
|
41789c6c3d | ||
|
|
260c86082d | ||
|
|
d30cbaf5dc | ||
|
|
9beaa85b07 | ||
|
|
e753f249e1 | ||
|
|
2d042274ac | ||
|
|
3bffd2e8e5 | ||
|
|
c3619f5536 | ||
|
|
3b56427a1e | ||
|
|
43489756ad | ||
|
|
a690edab17 | ||
|
|
ad6e62cd82 | ||
|
|
388e3251fa | ||
|
|
f5e2ed0fd8 | ||
|
|
562b998366 | ||
|
|
bb04446285 | ||
|
|
bfd75056b0 | ||
|
|
933841d903 | ||
|
|
6d0aa73981 | ||
|
|
b0b9b8091b | ||
|
|
53c8f700f4 | ||
|
|
901dde0e45 | ||
|
|
e239a4a20f | ||
|
|
fecaed0ed4 | ||
|
|
d86b49ac86 | ||
|
|
45ab8bf60e | ||
|
|
a1359b970c | ||
|
|
28f7ca1f80 | ||
|
|
a368b87791 | ||
|
|
f94f1c6016 | ||
|
|
c589862b78 | ||
|
|
5a49b793d9 | ||
|
|
40acf6b52a | ||
|
|
856a63da4d | ||
|
|
1ef41b8337 | ||
|
|
00e9c4cc96 | ||
|
|
189ff9b664 | ||
|
|
d8923270e6 | ||
|
|
5652f54ac2 | ||
|
|
7e7fc53da5 | ||
|
|
715534800a | ||
|
|
339e556feb | ||
|
|
5c18825a18 | ||
|
|
3e3e145497 | ||
|
|
47975ed53e | ||
|
|
ab05280666 | ||
|
|
b8ff56896c | ||
|
|
9d0029e215 | ||
|
|
83dba0b67b | ||
|
|
e24e19ce3b | ||
|
|
3d47a7f8ab | ||
|
|
9ce36e3e4b | ||
|
|
ba4bce2581 | ||
|
|
c4e9615691 |
@@ -4,8 +4,8 @@ jobs:
|
||||
working_directory: ~/pytorch-transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
resource_class: large
|
||||
parallelism: 4
|
||||
resource_class: xlarge
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install --progress-bar off .
|
||||
@@ -17,7 +17,7 @@ jobs:
|
||||
build_py2:
|
||||
working_directory: ~/pytorch-transformers
|
||||
resource_class: large
|
||||
parallelism: 4
|
||||
parallelism: 1
|
||||
docker:
|
||||
- image: circleci/python:2.7
|
||||
steps:
|
||||
@@ -26,9 +26,27 @@ jobs:
|
||||
- run: sudo pip install pytest codecov pytest-cov
|
||||
- run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
|
||||
- run: codecov
|
||||
deploy_doc:
|
||||
working_directory: ~/pytorch-transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
steps:
|
||||
- add_ssh_keys:
|
||||
fingerprints:
|
||||
- "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
|
||||
- checkout
|
||||
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
||||
- run: sudo pip install --progress-bar off -r requirements.txt
|
||||
- run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
||||
workflow_filters: &workflow_filters
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- master
|
||||
workflows:
|
||||
version: 2
|
||||
build_and_test:
|
||||
jobs:
|
||||
- build_py3
|
||||
- build_py2
|
||||
version: 2
|
||||
build_and_test:
|
||||
jobs:
|
||||
- build_py3
|
||||
- build_py2
|
||||
- deploy_doc: *workflow_filters
|
||||
5
.gitignore
vendored
5
.gitignore
vendored
@@ -127,4 +127,7 @@ proc_data
|
||||
|
||||
# examples
|
||||
runs
|
||||
examples/runs
|
||||
examples/runs
|
||||
|
||||
# data
|
||||
data
|
||||
15
README.md
15
README.md
@@ -12,7 +12,9 @@ The library currently contains PyTorch implementations, pre-trained model weight
|
||||
4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du et al.
|
||||
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5) by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
||||
|
||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
|
||||
|
||||
@@ -76,14 +78,15 @@ import torch
|
||||
from pytorch_transformers import *
|
||||
|
||||
# PyTorch-Transformers has a unified API
|
||||
# for 6 transformer architectures and 27 pretrained weights.
|
||||
# for 7 transformer architectures and 30 pretrained weights.
|
||||
# Model | Tokenizer | Pretrained weights shortcut
|
||||
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
|
||||
(OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
|
||||
(GPT2Model, GPT2Tokenizer, 'gpt2'),
|
||||
(TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
|
||||
(XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
|
||||
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024')]
|
||||
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
|
||||
(RobertaModel, RobertaTokenizer, 'roberta-base')]
|
||||
|
||||
# Let's encode some text in a sequence of hidden-states using each model:
|
||||
for model_class, tokenizer_class, pretrained_weights in MODELS:
|
||||
@@ -92,7 +95,7 @@ for model_class, tokenizer_class, pretrained_weights in MODELS:
|
||||
model = model_class.from_pretrained(pretrained_weights)
|
||||
|
||||
# Encode text
|
||||
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
|
||||
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)]) # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
|
||||
with torch.no_grad():
|
||||
last_hidden_states = model(input_ids)[0] # Models outputs are now tuples
|
||||
|
||||
@@ -327,7 +330,7 @@ Breaking change in the `from_pretrained()`method:
|
||||
|
||||
1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
|
||||
|
||||
2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuratoin class attributes.
|
||||
2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding to the model's `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
|
||||
|
||||
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
|
||||
|
||||
@@ -392,8 +395,8 @@ for batch in train_data:
|
||||
loss = model(batch)
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
|
||||
scheduler.step()
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
|
||||
@@ -12,8 +12,8 @@ Examples
|
||||
- How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models
|
||||
* - `Fine-tuning with BERT: running the examples <#fine-tuning-bert-examples>`_
|
||||
- Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py``
|
||||
* - `Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2 <#fine-tuning>`_
|
||||
- Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py`` and ``run_gpt2.py``
|
||||
* - `Fine-tuning with OpenAI GPT, Transformer-XL, GPT-2 as well as BERT and RoBERTa <#fine-tuning>`_
|
||||
- Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py``, ``run_gpt2.py`` and ``run_lm_finetuning.py``
|
||||
* - `Fine-tuning BERT-large on GPUs <#fine-tuning-bert-large>`_
|
||||
- How to fine tune ``BERT large``
|
||||
|
||||
@@ -68,7 +68,9 @@ GLUE results on dev set
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We get the following results on the dev set of GLUE benchmark with an uncased BERT base
|
||||
model. All experiments were run on a P100 GPU with a batch size of 32.
|
||||
model (`bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train batch size of 24. Some of
|
||||
these tasks have a small dataset and training can lead to high variance in the results between different runs.
|
||||
We report the median on 5 runs (with different seeds) for each of the metrics.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
@@ -78,31 +80,31 @@ model. All experiments were run on a P100 GPU with a batch size of 32.
|
||||
- Result
|
||||
* - CoLA
|
||||
- Matthew's corr.
|
||||
- 57.29
|
||||
- 55.75
|
||||
* - SST-2
|
||||
- accuracy
|
||||
- 93.00
|
||||
- 92.09
|
||||
* - MRPC
|
||||
- F1/accuracy
|
||||
- 88.85/83.82
|
||||
- 90.48/86.27
|
||||
* - STS-B
|
||||
- Pearson/Spearman corr.
|
||||
- 89.70/89.37
|
||||
- 89.03/88.64
|
||||
* - QQP
|
||||
- accuracy/F1
|
||||
- 90.72/87.41
|
||||
- 90.92/87.72
|
||||
* - MNLI
|
||||
- matched acc./mismatched acc.
|
||||
- 83.95/84.39
|
||||
- 83.74/84.06
|
||||
* - QNLI
|
||||
- accuracy
|
||||
- 89.04
|
||||
- 91.07
|
||||
* - RTE
|
||||
- accuracy
|
||||
- 61.01
|
||||
- 68.59
|
||||
* - WNLI
|
||||
- accuracy
|
||||
- 53.52
|
||||
- 43.66
|
||||
|
||||
|
||||
Some of these results are significantly different from the ones reported on the test set
|
||||
@@ -382,7 +384,7 @@ Training with the previous hyper-parameters on a single GPU gave us the followin
|
||||
LM Fine-tuning
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
The data should be a text file in the same format as `sample_text.txt <./samples/sample_text.txt>`_ (one sentence per line, docs separated by empty line).
|
||||
The data should be a text file in the same format as `sample_text.txt <./pytorch_transformers/tests/fixtures/sample_text.txt>`_ (one sentence per line, docs separated by empty line).
|
||||
You can download an `exemplary training corpus <https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt>`_ generated from wikipedia articles and split into ~500k sentences with spaCy.
|
||||
Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with ``train_batch_size=200`` and ``max_seq_length=128``\ :
|
||||
|
||||
@@ -393,12 +395,13 @@ Thank to the work of @Rocketknight1 and @tholor there are now **several scripts*
|
||||
OpenAI GPT, Transformer-XL and GPT-2: running the examples
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
We provide three examples of scripts for OpenAI GPT, Transformer-XL and OpenAI GPT-2 based on (and extended from) the respective original implementations:
|
||||
We provide four examples of scripts for OpenAI GPT, Transformer-XL, OpenAI GPT-2, BERT and RoBERTa based on (and extended from) the respective original implementations:
|
||||
|
||||
|
||||
* fine-tuning OpenAI GPT on the ROCStories dataset
|
||||
* evaluating Transformer-XL on Wikitext 103
|
||||
* unconditional and conditional generation from a pre-trained OpenAI GPT-2 model
|
||||
* fine-tuning GPT/GPT-2 on a causal language modeling task and BERT/RoBERTa on a masked language modeling task
|
||||
|
||||
Fine-tuning OpenAI GPT on the RocStories dataset
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@@ -452,7 +455,51 @@ Unconditional generation:
|
||||
|
||||
python run_gpt2.py --unconditional
|
||||
|
||||
The same option as in the original scripts are provided, please refere to the code of the example and the original repository of OpenAI.
|
||||
The same options as in the original scripts are provided; please refer to the code of the example and the original repository of OpenAI.
|
||||
|
||||
|
||||
Causal LM fine-tuning on GPT/GPT-2, Masked LM fine-tuning on BERT/RoBERTa
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Before running the following examples you should download the `WikiText-2 dataset <https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/>`__ and unpack it to some directory `$WIKITEXT_2_DATASET`
|
||||
The following results were obtained using the `raw` WikiText-2 (no tokens were replaced before the tokenization).
|
||||
|
||||
This example fine-tunes GPT-2 on the WikiText-2 dataset. The loss function is a causal language modeling loss (perplexity).
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
||||
export WIKITEXT_2_DATASET=/path/to/wikitext_dataset
|
||||
|
||||
python run_lm_finetuning.py
|
||||
--output_dir=output
|
||||
--model_type=gpt2
|
||||
--model_name_or_path=gpt2
|
||||
--do_train
|
||||
--train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw
|
||||
--do_eval
|
||||
--eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw
|
||||
|
||||
This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run.
|
||||
It reaches a score of about 20 perplexity once fine-tuned on the dataset.
|
||||
|
||||
This example fine-tunes RoBERTa on the WikiText-2 dataset. The loss function is a masked language modeling loss (masked perplexity).
|
||||
The `--mlm` flag is necessary to fine-tune BERT/RoBERTa on masked language modeling.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
||||
export WIKITEXT_2_DATASET=/path/to/wikitext_dataset
|
||||
|
||||
python run_lm_finetuning.py
|
||||
--output_dir=output
|
||||
--model_type=roberta
|
||||
--model_name_or_path=roberta-base
|
||||
--do_train
|
||||
--train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw
|
||||
--do_eval
|
||||
--eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw
|
||||
--mlm
|
||||
|
||||
.. _fine-tuning-BERT-large:
|
||||
|
||||
|
||||
@@ -11,6 +11,8 @@ The library currently contains PyTorch implementations, pre-trained model weight
|
||||
4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
||||
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
@@ -48,3 +50,4 @@ The library currently contains PyTorch implementations, pre-trained model weight
|
||||
model_doc/xlm
|
||||
model_doc/xlnet
|
||||
model_doc/roberta
|
||||
model_doc/distilbert
|
||||
|
||||
@@ -52,6 +52,12 @@ If you want to reproduce the original tokenization process of the ``OpenAI GPT``
|
||||
If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
|
||||
|
||||
|
||||
Note on model downloads (Continuous Integration or large-scale deployments)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
|
||||
|
||||
|
||||
Do you want to run a Transformer model on a mobile device?
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
||||
43
docs/source/model_doc/distilbert.rst
Normal file
43
docs/source/model_doc/distilbert.rst
Normal file
@@ -0,0 +1,43 @@
|
||||
DistilBERT
|
||||
----------------------------------------------------
|
||||
|
||||
``DistilBertConfig``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: pytorch_transformers.DistilBertConfig
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertTokenizer``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: pytorch_transformers.DistilBertTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertModel``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: pytorch_transformers.DistilBertModel
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertForMaskedLM``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertForSequenceClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertForQuestionAnswering``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: pytorch_transformers.DistilBertForQuestionAnswering
|
||||
:members:
|
||||
@@ -62,6 +62,9 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``gpt2-medium`` | | 24-layer, 1024-hidden, 16-heads, 345M parameters. |
|
||||
| | | | OpenAI's Medium-sized GPT-2 English model |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``gpt2-large`` | | 36-layer, 1280-hidden, 20-heads, 774M parameters. |
|
||||
| | | | OpenAI's Large-sized GPT-2 English model |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| Transformer-XL | ``transfo-xl-wt103`` | | 18-layer, 1024-hidden, 16-heads, 257M parameters. |
|
||||
| | | | English model trained on wikitext-103 |
|
||||
@@ -72,16 +75,16 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | ``xlnet-large-cased`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
|
||||
| | | | XLNet Large English model |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| XLM | ``xlm-mlm-en-2048`` | | 12-layer, 1024-hidden, 8-heads |
|
||||
| XLM | ``xlm-mlm-en-2048`` | | 12-layer, 2048-hidden, 16-heads |
|
||||
| | | | XLM English model |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``xlm-mlm-ende-1024`` | | 12-layer, 1024-hidden, 8-heads |
|
||||
| | ``xlm-mlm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads |
|
||||
| | | | XLM English-German Multi-language model |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``xlm-mlm-enfr-1024`` | | 12-layer, 1024-hidden, 8-heads |
|
||||
| | ``xlm-mlm-enfr-1024`` | | 6-layer, 1024-hidden, 8-heads |
|
||||
| | | | XLM English-French Multi-language model |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``xlm-mlm-enro-1024`` | | 12-layer, 1024-hidden, 8-heads |
|
||||
| | ``xlm-mlm-enro-1024`` | | 6-layer, 1024-hidden, 8-heads |
|
||||
| | | | XLM English-Romanian Multi-language model |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``xlm-mlm-xnli15-1024`` | | 12-layer, 1024-hidden, 8-heads |
|
||||
@@ -93,7 +96,7 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | ``xlm-clm-enfr-1024`` | | 12-layer, 1024-hidden, 8-heads |
|
||||
| | | | XLM English model trained with CLM (Causal Language Modeling) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``xlm-clm-ende-1024`` | | 12-layer, 1024-hidden, 8-heads |
|
||||
| | ``xlm-clm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads |
|
||||
| | | | XLM English-German Multi-language model trained with CLM (Causal Language Modeling) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| RoBERTa | ``roberta-base`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
|
||||
@@ -108,5 +111,13 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__. |
|
||||
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| DistilBERT | ``distilbert-base-uncased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
|
||||
| | | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``distilbert-base-uncased-distilled-squad`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. |
|
||||
| | | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
.. <https://huggingface.co/pytorch-transformers/examples.html>`__
|
||||
100
examples/distillation/README.md
Normal file
100
examples/distillation/README.md
Normal file
@@ -0,0 +1,100 @@
|
||||
# DistilBERT
|
||||
|
||||
This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT.
|
||||
|
||||
## What is DistilBERT
|
||||
|
||||
DistilBERT stands for Distilled-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on the BERT architecture. It has 40% fewer parameters than `bert-base-uncased` and runs 60% faster while preserving over 95% of BERT's performance as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distilling BERT, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option for putting large-scale trained Transformer models into production.
|
||||
|
||||
For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5).
|
||||
|
||||
## How to use DistilBERT
|
||||
|
||||
PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
||||
|
||||
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain BERT (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of BERT. The model has 6 layers, a hidden dimension of 768 and 12 heads, totaling 66M parameters.
|
||||
- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` fine-tuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.2 on the dev set (for comparison, the `bert-base-uncased` version of BERT reaches an F1 score of 88.5).
|
||||
|
||||
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`, even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to keep naming consistent across the library's models.
|
||||
|
||||
```python
|
||||
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
|
||||
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
|
||||
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
```
|
||||
|
||||
## How to train DistilBERT
|
||||
|
||||
In the following, we will explain how you can train your own compressed model.
|
||||
|
||||
### A. Preparing the data
|
||||
|
||||
The weights we release are trained using a concatenation of Toronto Book Corpus and English Wikipedia (same training data as the English version of BERT).
|
||||
|
||||
To avoid processing the data several times, we do it once and for all before the training. From now on, we will suppose that you have a text file `dump.txt` which contains one sequence per line (a sequence being composed of one or several coherent sentences).
|
||||
|
||||
First, we will binarize the data, i.e. tokenize the data and convert each token into an index in our model's vocabulary.
|
||||
|
||||
```bash
|
||||
python scripts/binarized_data.py \
|
||||
--file_path data/dump.txt \
|
||||
--bert_tokenizer bert-base-uncased \
|
||||
--dump_file data/binarized_text
|
||||
```
|
||||
|
||||
Our implementation of the masked language modeling loss follows that of [XLM](https://github.com/facebookresearch/XLM) and smooths the probability of masking with a factor that puts more emphasis on rare words. Thus we count the occurrences of each token in the data:
|
||||
|
||||
```bash
|
||||
python scripts/token_counts.py \
|
||||
--data_file data/binarized_text.bert-base-uncased.pickle \
|
||||
--token_counts_dump data/token_counts.bert-base-uncased.pickle
|
||||
```
|
||||
|
||||
### B. Training
|
||||
|
||||
Training with distillation is really simple once you have pre-processed the data:
|
||||
|
||||
```bash
|
||||
python train.py \
|
||||
--dump_path serialization_dir/my_first_training \
|
||||
--data_file data/binarized_text.bert-base-uncased.pickle \
|
||||
--token_counts data/token_counts.bert-base-uncased.pickle \
|
||||
--force # overwrites the `dump_path` if it already exists.
|
||||
```
|
||||
|
||||
By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
|
||||
|
||||
We highly encourage you to use distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
|
||||
|
||||
```bash
|
||||
export NODE_RANK=0
|
||||
export N_NODES=1
|
||||
|
||||
export N_GPU_NODE=4
|
||||
export WORLD_SIZE=4
|
||||
export MASTER_PORT=<AN_OPEN_PORT>
|
||||
export MASTER_ADDR=<I.P.>
|
||||
|
||||
pkill -f 'python -u train.py'
|
||||
|
||||
python -m torch.distributed.launch \
|
||||
--nproc_per_node=$N_GPU_NODE \
|
||||
--nnodes=$N_NODES \
|
||||
--node_rank $NODE_RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT \
|
||||
train.py \
|
||||
--force \
|
||||
--n_gpu $WORLD_SIZE \
|
||||
--data_file data/binarized_text.bert-base-uncased.pickle \
|
||||
--token_counts data/token_counts.bert-base-uncased.pickle \
|
||||
--dump_path serialization_dir/my_first_distillation
|
||||
```
|
||||
|
||||
**Tips:** Starting distilled training with a good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!
|
||||
|
||||
Happy distillation!
|
||||
201
examples/distillation/dataset.py
Normal file
201
examples/distillation/dataset.py
Normal file
@@ -0,0 +1,201 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Dataloaders to train DistilBERT
|
||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||
"""
|
||||
from typing import List
|
||||
import math
|
||||
from itertools import chain
|
||||
from collections import Counter
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from utils import logger
|
||||
|
||||
class Dataset:
    """
    In-memory corpus of tokenized sequences that can be served as padded batches.

    On construction, sequences longer than `params.max_position_embeddings` are split
    into chunks and sequences of 5 tokens or fewer are dropped.

    Attributes read from `params`:
        tokens_per_batch, batch_size, shuffle, group_by_size, max_position_embeddings,
        special_tok_ids (keys 'cls_token', 'sep_token', 'pad_token'), is_master,
        and, for `split` only, n_gpu, world_size and global_rank.
    """

    def __init__(self,
                 params,
                 data):
        """
        Input:
        ------
            params: object exposing the attributes listed in the class docstring.
            data: sequence of token-id sequences (one entry per training sequence).
        """
        self.params = params
        self.tokens_per_batch = params.tokens_per_batch
        self.batch_size = params.batch_size
        self.shuffle = params.shuffle
        self.group_by_size = params.group_by_size

        # Always store the corpus as a 1-D object array of 1-D arrays: `np.array(data)`
        # raises on ragged input with recent NumPy and silently builds a 2-D array when
        # all sequences happen to have the same length.
        self.token_ids = self._as_object_array(data)
        # int64 rather than uint16: a uint16 length wraps around for sequences longer
        # than 65535 tokens, which would make them look short and skip the splitting.
        self.lengths = np.array([len(t) for t in self.token_ids], dtype=np.int64)

        self.check()
        self.remove_long_sequences()
        self.remove_empty_sequences()
        self.check()
        self.print_statistics()

    @staticmethod
    def _as_object_array(sequences):
        """
        Pack `sequences` into a 1-D object array whose elements are 1-D numpy arrays.
        """
        packed = np.empty(len(sequences), dtype=object)
        for position, sequence in enumerate(sequences):
            packed[position] = np.asarray(sequence)
        return packed

    def __len__(self):
        """
        Number of sequences currently held.
        """
        return len(self.lengths)

    def check(self):
        """
        Some sanity checks
        """
        assert len(self.token_ids) == len(self.lengths)

    def remove_long_sequences(self):
        """
        Sequences that are too long are split into chunks of at most max_position_embeddings tokens.

        Each chunk holds `max_position_embeddings - 2` original tokens so that a leading
        `cls_token` and a trailing `sep_token` can be added when they are missing.
        """
        max_len = self.params.max_position_embeddings
        cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']

        # Count with the same strict comparison that decides the split below.
        n_too_long = int((self.lengths > max_len).sum())
        logger.info(f'Splitting {n_too_long} too long sequences.')

        def divide_chunks(l, n):
            return [l[i:i + n] for i in range(0, len(l), n)]

        new_tok_ids = []
        for seq_, len_ in zip(self.token_ids, self.lengths):
            if len_ <= max_len:
                new_tok_ids.append(seq_)
                continue
            for sub_s in divide_chunks(seq_, max_len - 2):
                if sub_s[0] != cls_id:
                    sub_s = np.insert(sub_s, 0, cls_id)
                if sub_s[-1] != sep_id:
                    # Close the chunk with the separator token (not the cls token).
                    sub_s = np.insert(sub_s, len(sub_s), sep_id)
                assert len(sub_s) <= max_len
                new_tok_ids.append(sub_s)

        self.token_ids = self._as_object_array(new_tok_ids)
        self.lengths = np.array([len(t) for t in new_tok_ids], dtype=np.int64)

    def remove_empty_sequences(self):
        """
        Too short sequences (5 tokens or fewer) are simply removed. This could be tuned.
        """
        init_size = len(self)
        indices = self.lengths > 5
        self.token_ids = self.token_ids[indices]
        self.lengths = self.lengths[indices]
        new_size = len(self)
        logger.info(f'Remove {init_size - new_size} too short (<=5 tokens) sequences.')

    def print_statistics(self):
        """
        Print some statistics on the corpus. Only the master process.
        """
        if not self.params.is_master:
            return
        logger.info(f'{len(self)} sequences')
        # NOTE: the token-level statistics below are disabled in the original code.
        # data_len = sum(self.lengths)
        # nb_unique_tokens = len(Counter(list(chain(*self.token_ids))))
        # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')

        # unk_idx = self.params.special_tok_ids['unk_token']
        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')

    def select_data(self, a: int, b: int):
        """
        Select a subportion of the data: keep sequences with index in `[a, b)`.
        """
        n_sequences = len(self)
        assert 0 <= a < b <= n_sequences, f'`0 <= a < b <= n_sequences` is not met with a={a} and b={b}'

        logger.info(f'Selecting sequences from {a} to {b} (excluded).')
        self.token_ids = self.token_ids[a:b]
        self.lengths = self.lengths[a:b]

        self.check()

    def split(self):
        """
        Distributed training: split the data across the processes.

        Every process keeps `len(self) // world_size` consecutive sequences, selected by
        its `global_rank`; the remainder of the integer division is not assigned.
        """
        assert self.params.n_gpu > 1
        logger.info('Splitting the data accross the processuses.')
        n_seq = len(self)
        n_seq_per_procesus = n_seq // self.params.world_size
        a = n_seq_per_procesus * self.params.global_rank
        b = a + n_seq_per_procesus
        self.select_data(a=a, b=b)

    def batch_sequences(self,
                        token_ids: List[List[int]],
                        lengths: List[int]):
        """
        Do the padding and transform into torch.tensor.

        Input:
        ------
            token_ids: the sequences of the batch (lists or numpy arrays of token ids).
            lengths: the length of each sequence (list or numpy array).

        Output:
        -------
            tk_t: `torch.tensor(bs, max_seq_len_)` - Token ids right-padded with `pad_token`.
            lg_t: `torch.tensor(bs)` - The lengths.
        """
        assert len(token_ids) == len(lengths)

        # `np.asarray` lets plain lists through, as the annotations advertise.
        lengths = np.asarray(lengths)

        # Max for paddings
        max_seq_len_ = max(lengths)

        # Pad token ids
        pad_idx = self.params.special_tok_ids['pad_token']
        tk_ = [list(np.asarray(t).astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
        assert len(tk_) == len(token_ids)
        assert all(len(t) == max_seq_len_ for t in tk_)

        tk_t = torch.tensor(tk_)                  # (bs, max_seq_len_)
        lg_t = torch.tensor(lengths.astype(int))  # (bs)
        return tk_t, lg_t

    def get_batches_iterator(self,
                             batches):
        """
        Return an iterator over batches.

        `batches` is an iterable of index arrays; each one is turned into a
        `(token_ids, lengths)` pair of tensors by `batch_sequences`.
        """
        for sequences_ids in batches:
            token_ids, lengths = self.batch_sequences(self.token_ids[sequences_ids],
                                                      self.lengths[sequences_ids])
            yield (token_ids, lengths)

    def get_iterator(self,
                     seed: int = None):
        """
        Return a data iterator.

        Sequences are optionally sorted by length (`group_by_size`), grouped into batches
        either by count (`batch_size`, when `tokens_per_batch == -1`) or by cumulated
        number of tokens (`tokens_per_batch`), and the batches are optionally shuffled
        with a `np.random.RandomState(seed)`.
        """
        rng = np.random.RandomState(seed)

        n_sequences = len(self)
        indices = np.arange(n_sequences)

        if self.group_by_size:
            # mergesort is stable: sequences of equal length keep their relative order.
            indices = indices[np.argsort(self.lengths[indices], kind='mergesort')]

        if self.tokens_per_batch == -1:
            batches = np.array_split(indices, math.ceil(len(indices) * 1. / self.batch_size))
        else:
            assert self.tokens_per_batch > 0
            # A new batch starts every time the running token count crosses a multiple
            # of `tokens_per_batch`.
            batch_ids = np.cumsum(self.lengths[indices]) // self.tokens_per_batch
            _, bounds = np.unique(batch_ids, return_index=True)
            batches = [indices[bounds[i]:bounds[i + 1]] for i in range(len(bounds) - 1)]
            if bounds[-1] < len(indices):
                batches.append(indices[bounds[-1]:])

        if self.shuffle:
            rng.shuffle(batches)

        assert n_sequences == sum([len(x) for x in batches])
        assert self.lengths[indices].sum() == sum([self.lengths[x].sum() for x in batches])

        return self.get_batches_iterator(batches=batches)
|
||||
448
examples/distillation/distiller.py
Normal file
448
examples/distillation/distiller.py
Normal file
@@ -0,0 +1,448 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" The distiller to distil DistilBERT
|
||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||
"""
|
||||
import os
|
||||
import math
|
||||
from tensorboardX import SummaryWriter
|
||||
from tqdm import trange, tqdm
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from pytorch_transformers import AdamW, WarmupLinearSchedule
|
||||
|
||||
from utils import logger
|
||||
from dataset import Dataset
|
||||
|
||||
class Distiller:
|
||||
def __init__(self,
|
||||
params: dict,
|
||||
dataloader: Dataset,
|
||||
token_probs: torch.tensor,
|
||||
student: nn.Module,
|
||||
teacher: nn.Module):
|
||||
logger.info('Initializing Distiller')
|
||||
self.params = params
|
||||
self.dump_path = params.dump_path
|
||||
self.multi_gpu = params.multi_gpu
|
||||
self.fp16 = params.fp16
|
||||
|
||||
self.student = student
|
||||
self.teacher = teacher
|
||||
|
||||
self.dataloader = dataloader
|
||||
if self.params.n_gpu > 1:
|
||||
self.dataloader.split()
|
||||
self.get_iterator(seed=params.seed)
|
||||
|
||||
self.temperature = params.temperature
|
||||
assert self.temperature > 0.
|
||||
|
||||
self.alpha_ce = params.alpha_ce
|
||||
self.alpha_mlm = params.alpha_mlm
|
||||
self.alpha_mse = params.alpha_mse
|
||||
assert self.alpha_ce >= 0.
|
||||
assert self.alpha_mlm >= 0.
|
||||
assert self.alpha_mse >= 0.
|
||||
assert self.alpha_ce + self.alpha_mlm + self.alpha_mse > 0.
|
||||
|
||||
self.mlm_mask_prop = params.mlm_mask_prop
|
||||
assert 0.0 <= self.mlm_mask_prop <= 1.0
|
||||
assert params.word_mask + params.word_keep + params.word_rand == 1.0
|
||||
self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
|
||||
self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
|
||||
self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
|
||||
if self.fp16:
|
||||
self.pred_probs = self.pred_probs.half()
|
||||
self.token_probs = self.token_probs.half()
|
||||
|
||||
self.epoch = 0
|
||||
self.n_iter = 0
|
||||
self.n_total_iter = 0
|
||||
self.n_sequences_epoch = 0
|
||||
self.total_loss_epoch = 0
|
||||
self.last_loss = 0
|
||||
self.last_loss_ce = 0
|
||||
self.last_loss_mlm = 0
|
||||
self.last_loss_mse = 0
|
||||
|
||||
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
|
||||
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
||||
self.mse_loss_fct = nn.MSELoss(reduction='sum')
|
||||
|
||||
logger.info('--- Initializing model optimizer')
|
||||
assert params.gradient_accumulation_steps >= 1
|
||||
self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
|
||||
num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
|
||||
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
|
||||
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay},
|
||||
{'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0}
|
||||
]
|
||||
logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad]))
|
||||
logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
|
||||
self.optimizer = AdamW(optimizer_grouped_parameters,
|
||||
lr=params.learning_rate,
|
||||
eps=params.adam_epsilon,
|
||||
betas=(0.9, 0.98))
|
||||
self.scheduler = WarmupLinearSchedule(self.optimizer,
|
||||
warmup_steps=warmup_steps,
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
if self.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
|
||||
self.student, self.optimizer = amp.initialize(self.student,
|
||||
self.optimizer,
|
||||
opt_level=self.params.fp16_opt_level)
|
||||
self.teacher = self.teacher.half()
|
||||
|
||||
if self.multi_gpu:
|
||||
if self.fp16:
|
||||
from apex.parallel import DistributedDataParallel
|
||||
logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
|
||||
self.student = DistributedDataParallel(self.student)
|
||||
else:
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
|
||||
self.student = DistributedDataParallel(self.student,
|
||||
device_ids=[params.local_rank],
|
||||
output_device=params.local_rank)
|
||||
|
||||
self.is_master = params.is_master
|
||||
if self.is_master:
|
||||
logger.info('--- Initializing Tensorboard')
|
||||
self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
|
||||
self.tensorboard.add_text(tag='config', text_string=str(self.params), global_step=0)
|
||||
|
||||
def get_iterator(self,
|
||||
seed: int = None):
|
||||
"""
|
||||
Initialize the data iterator.
|
||||
Each process has its own data iterator (iterating on his own random portion of the dataset).
|
||||
|
||||
Input:
|
||||
------
|
||||
seed: `int` - The random seed.
|
||||
"""
|
||||
logger.info('--- Initializing Data Iterator')
|
||||
self.data_iterator = self.dataloader.get_iterator(seed=seed)
|
||||
|
||||
def get_batch(self):
|
||||
"""
|
||||
Call the data iterator to output a new batch.
|
||||
If the data iterator went through the whole dataset, create a new iterator.
|
||||
"""
|
||||
assert hasattr(self, 'data_iterator')
|
||||
try:
|
||||
x = next(self.data_iterator)
|
||||
except StopIteration:
|
||||
logger.warning('--- Went through the whole dataset. Creating new data iterator.')
|
||||
self.data_iterator = self.dataloader.get_iterator()
|
||||
x = next(self.data_iterator)
|
||||
return x
|
||||
|
||||
def prepare_batch(self,
|
||||
batch):
|
||||
"""
|
||||
Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
|
||||
|
||||
Input:
|
||||
------
|
||||
batch: `Tuple`
|
||||
token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded.
|
||||
lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
|
||||
|
||||
Output:
|
||||
-------
|
||||
token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
|
||||
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
|
||||
mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -1 where there is nothing to predict.
|
||||
"""
|
||||
token_ids, lengths = batch
|
||||
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
|
||||
assert token_ids.size(0) == lengths.size(0)
|
||||
|
||||
attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
|
||||
|
||||
bs, max_seq_len = token_ids.size()
|
||||
mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
|
||||
|
||||
x_prob = self.token_probs[token_ids.flatten()]
|
||||
n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
|
||||
tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
|
||||
pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.uint8, device=token_ids.device)
|
||||
pred_mask[tgt_ids] = 1
|
||||
pred_mask = pred_mask.view(bs, max_seq_len)
|
||||
|
||||
pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0
|
||||
|
||||
# mask a number of words == 0 [8] (faster with fp16)
|
||||
if self.fp16:
|
||||
n1 = pred_mask.sum().item()
|
||||
if n1 > 8:
|
||||
pred_mask = pred_mask.view(-1)
|
||||
n2 = max(n1 % 8, 8 * (n1 // 8))
|
||||
if n2 != n1:
|
||||
pred_mask[torch.nonzero(pred_mask).view(-1)[:n1-n2]] = 0
|
||||
pred_mask = pred_mask.view(bs, max_seq_len)
|
||||
assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
|
||||
|
||||
_token_ids_real = token_ids[pred_mask]
|
||||
_token_ids_rand = _token_ids_real.clone().random_(self.params.vocab_size)
|
||||
_token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
|
||||
probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
|
||||
_token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
|
||||
token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
|
||||
|
||||
mlm_labels[1-pred_mask] = -1
|
||||
|
||||
return token_ids, attn_mask, mlm_labels
|
||||
|
||||
def round_batch(self,
|
||||
x: torch.tensor,
|
||||
lengths: torch.tensor):
|
||||
"""
|
||||
For float16 only.
|
||||
Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8.
|
||||
|
||||
Input:
|
||||
------
|
||||
x: `torch.tensor(bs, seq_length)` - The token ids.
|
||||
lengths: `torch.tensor(bs, seq_length)` - The lengths of each of the sequence in the batch.
|
||||
|
||||
Output:
|
||||
-------
|
||||
x: `torch.tensor(new_bs, new_seq_length)` - The updated token ids.
|
||||
lengths: `torch.tensor(new_bs, new_seq_length)` - The updated lengths.
|
||||
"""
|
||||
if not self.fp16 or len(lengths) < 8:
|
||||
return x, lengths
|
||||
|
||||
# number of sentences == 0 [8]
|
||||
bs1 = len(lengths)
|
||||
bs2 = 8 * (bs1 // 8)
|
||||
assert bs2 > 0 and bs2 % 8 == 0
|
||||
if bs1 != bs2:
|
||||
idx = torch.randperm(bs1)[:bs2]
|
||||
lengths = lengths[idx]
|
||||
slen = lengths.max().item()
|
||||
x = x[idx, :slen]
|
||||
else:
|
||||
idx = None
|
||||
|
||||
# sequence length == 0 [8]
|
||||
ml1 = x.size(1)
|
||||
if ml1 % 8 != 0:
|
||||
pad = 8 - (ml1 % 8)
|
||||
ml2 = ml1 + pad
|
||||
pad_id = self.params.special_tok_ids['pad_token']
|
||||
padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
|
||||
x = torch.cat([x, padding_tensor], 1)
|
||||
assert x.size() == (bs2, ml2)
|
||||
|
||||
assert x.size(0) % 8 == 0
|
||||
assert x.size(1) % 8 == 0
|
||||
return x, lengths
|
||||
|
||||
def train(self):
|
||||
"""
|
||||
The real training loop.
|
||||
"""
|
||||
if self.is_master: logger.info('Starting training')
|
||||
self.student.train()
|
||||
self.teacher.eval()
|
||||
|
||||
for _ in range(self.params.n_epoch):
|
||||
if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
|
||||
|
||||
iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
|
||||
for __ in range(self.num_steps_epoch):
|
||||
batch = self.get_batch()
|
||||
if self.params.n_gpu > 0:
|
||||
batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
|
||||
token_ids, attn_mask, mlm_labels = self.prepare_batch(batch=batch)
|
||||
|
||||
self.step(input_ids=token_ids, attention_mask=attn_mask, mlm_labels=mlm_labels)
|
||||
|
||||
iter_bar.update()
|
||||
iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
|
||||
'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}'})
|
||||
iter_bar.close()
|
||||
|
||||
if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
|
||||
self.end_epoch()
|
||||
|
||||
if self.is_master: logger.info('Training is finished')
|
||||
|
||||
def step(self,
|
||||
input_ids: torch.tensor,
|
||||
attention_mask: torch.tensor,
|
||||
mlm_labels: torch.tensor):
|
||||
"""
|
||||
One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
|
||||
and possibly a parameter update (depending on the gradient accumulation).
|
||||
|
||||
Input:
|
||||
------
|
||||
input_ids: `torch.tensor(bs, seq_length)` - The token ids.
|
||||
attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
|
||||
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
|
||||
"""
|
||||
s_logits = self.student(input_ids=input_ids, attention_mask=attention_mask)[0] # (bs, seq_length, voc_size)
|
||||
with torch.no_grad():
|
||||
t_logits = self.teacher(input_ids=input_ids, attention_mask=attention_mask)[0] # (bs, seq_length, voc_size)
|
||||
assert s_logits.size() == t_logits.size()
|
||||
|
||||
#https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
|
||||
#https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
|
||||
if self.params.restrict_ce_to_mask:
|
||||
mask = (mlm_labels>-1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||
else:
|
||||
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||
s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
|
||||
t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||
t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
|
||||
assert t_logits_slct.size() == s_logits_slct.size()
|
||||
|
||||
loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
|
||||
F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
|
||||
loss = self.alpha_ce*loss_ce
|
||||
if self.alpha_mlm > 0.:
|
||||
loss_mlm = self.mlm_loss_fct(s_logits.view(-1, s_logits.size(-1)), mlm_labels.view(-1))
|
||||
loss += self.alpha_mlm * loss_mlm
|
||||
if self.alpha_mse > 0.:
|
||||
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
|
||||
loss += self.alpha_mse * loss_mse
|
||||
|
||||
self.total_loss_epoch += loss.item()
|
||||
self.last_loss = loss.item()
|
||||
self.last_loss_ce = loss_ce.item()
|
||||
if self.alpha_mlm > 0.:
|
||||
self.last_loss_mlm = loss_mlm.item()
|
||||
if self.alpha_mse > 0.:
|
||||
self.last_loss_mse = loss_mse.item()
|
||||
|
||||
self.optimize(loss)
|
||||
|
||||
self.n_sequences_epoch += input_ids.size(0)
|
||||
|
||||
def optimize(self,
|
||||
loss):
|
||||
"""
|
||||
Normalization on the loss (gradient accumulation or distributed training), followed by
|
||||
backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
|
||||
Also update the metrics for tensorboard.
|
||||
"""
|
||||
# Check for NaN
|
||||
if (loss != loss).data.any():
|
||||
logger.error('NaN detected')
|
||||
exit()
|
||||
|
||||
if self.multi_gpu:
|
||||
loss = loss.mean()
|
||||
if self.params.gradient_accumulation_steps > 1:
|
||||
loss = loss / self.params.gradient_accumulation_steps
|
||||
|
||||
if self.fp16:
|
||||
from apex import amp
|
||||
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
self.iter()
|
||||
if self.n_iter % self.params.gradient_accumulation_steps == 0:
|
||||
if self.fp16:
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
|
||||
else:
|
||||
torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
|
||||
self.scheduler.step()
|
||||
self.optimizer.step()
|
||||
self.optimizer.zero_grad()
|
||||
|
||||
def iter(self):
|
||||
"""
|
||||
Update global counts, write to tensorboard and save checkpoint.
|
||||
"""
|
||||
self.n_iter += 1
|
||||
self.n_total_iter += 1
|
||||
|
||||
if self.n_total_iter % self.params.log_interval == 0:
|
||||
self.log_tensorboard()
|
||||
if self.n_total_iter % self.params.checkpoint_interval == 0:
|
||||
self.save_checkpoint()
|
||||
|
||||
def log_tensorboard(self):
|
||||
"""
|
||||
Log into tensorboard. Only by the master process.
|
||||
"""
|
||||
if not self.is_master:
|
||||
return
|
||||
|
||||
for param_name, param in self.student.named_parameters():
|
||||
self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter)
|
||||
if param.grad is None:
|
||||
continue
|
||||
self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(),global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter)
|
||||
|
||||
self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
|
||||
if self.alpha_mlm > 0.:
|
||||
self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
|
||||
if self.alpha_mse > 0.:
|
||||
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
|
||||
|
||||
def end_epoch(self):
    """
    Close the current epoch (one full pass over the dataset).

    On the master process, an epoch-named checkpoint is saved and the average
    epoch loss is written to tensorboard. Afterwards the epoch index is
    advanced and the per-epoch counters are cleared.
    """
    logger.info(f'{self.n_sequences_epoch} sequences have been trained during this epoch.')

    if self.is_master:
        self.save_checkpoint(checkpoint_name=f'model_epoch_{self.epoch}.pth')
        self.tensorboard.add_scalar(tag='epoch/loss',
                                    scalar_value=self.total_loss_epoch/self.n_iter,
                                    global_step=self.epoch)

    self.epoch += 1
    # Reset everything that is accumulated per epoch.
    self.n_sequences_epoch = self.n_iter = self.total_loss_epoch = 0
|
||||
|
||||
def save_checkpoint(self,
                    checkpoint_name: str = 'checkpoint.pth'):
    """
    Serialize the student into `dump_path` (master process only).

    The student's config is saved with `save_pretrained` and its state dict is
    written with `torch.save` under `checkpoint_name`.
    """
    if not self.is_master:
        return

    # A wrapped model exposes the real network as `.module`; save that one.
    mdl_to_save = getattr(self.student, 'module', self.student)
    mdl_to_save.config.save_pretrained(self.dump_path)

    checkpoint_path = os.path.join(self.dump_path, checkpoint_name)
    torch.save(mdl_to_save.state_dict(), checkpoint_path)
|
||||
1
examples/distillation/requirements.txt
Normal file
1
examples/distillation/requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
gitpython==3.0.2
|
||||
77
examples/distillation/scripts/binarized_data.py
Normal file
77
examples/distillation/scripts/binarized_data.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Preprocessing script before training DistilBERT.
|
||||
"""
|
||||
import argparse
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import numpy as np
|
||||
from pytorch_transformers import BertTokenizer
|
||||
|
||||
from examples.distillation.utils import logger
|
||||
|
||||
def main():
    """
    Tokenize a raw text file once and dump the resulting token ids to a pickle.

    Each line of `--file_path` is stripped, wrapped in `[CLS] ... [SEP]` and
    encoded with the requested BERT tokenizer. The encoded sequences are
    converted to numpy integer arrays, shuffled and pickled to
    `<dump_file>.<bert_tokenizer>.pickle`.
    """
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--bert_tokenizer', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.bert_tokenizer})')
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer)

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info('Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    interval = 10000
    start = time.time()
    # `n_processed` replaces a counter that used to shadow the builtin `iter`.
    for n_processed, text in enumerate(data, start=1):
        text = f'[CLS] {text.strip()} [SEP]'
        token_ids = bert_tokenizer.encode(text)
        rslt.append(token_ids)

        if n_processed % interval == 0:
            end = time.time()
            logger.info(f'{n_processed} examples processed. - {(end-start)/interval:.2f}s/expl')
            start = time.time()
    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.bert_tokenizer}.pickle'
    # uint16 halves the dump size but can only hold ids below 65536; fall back
    # to int32 for larger vocabularies instead of wrapping or failing.
    max_token_id = max((max(ids) for ids in rslt if ids), default=0)
    dtype = np.uint16 if max_token_id < (1 << 16) else np.int32
    rslt_ = [np.asarray(d, dtype=dtype) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    main()
|
||||
76
examples/distillation/scripts/extract_for_distil.py
Normal file
76
examples/distillation/scripts/extract_for_distil.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Preprocessing script before training DistilBERT.
|
||||
"""
|
||||
from pytorch_transformers import BertForPreTraining
|
||||
import torch
|
||||
import argparse
|
||||
|
||||
if __name__ == '__main__':
    # Copy a subset of a pre-trained BertForPreTraining state dict into a new
    # state dict keyed with DistilBERT parameter names, then save it.
    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForPreTraining for Transfer Learned Distillation")
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
    parser.add_argument("--dump_checkpoint", default='serialization_dir/transfer_learning_checkpoint_0247911.pth', type=str)
    parser.add_argument("--vocab_transform", action='store_true')
    args = parser.parse_args()

    model = BertForPreTraining.from_pretrained(args.bert_model)
    state_dict = model.state_dict()
    compressed_sd = {}

    # Embedding tables and their layer norm.
    for w in ['word_embeddings', 'position_embeddings']:
        compressed_sd[f'distilbert.embeddings.{w}.weight'] = state_dict[f'bert.embeddings.{w}.weight']
    for w in ['weight', 'bias']:
        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = state_dict[f'bert.embeddings.LayerNorm.{w}']

    # (student sub-module, teacher sub-module) pairs, listed in copy order.
    sub_module_map = [
        ('attention.q_lin', 'attention.self.query'),
        ('attention.k_lin', 'attention.self.key'),
        ('attention.v_lin', 'attention.self.value'),
        ('attention.out_lin', 'attention.output.dense'),
        ('sa_layer_norm', 'attention.output.LayerNorm'),
        ('ffn.lin1', 'intermediate.dense'),
        ('ffn.lin2', 'output.dense'),
        ('output_layer_norm', 'output.LayerNorm'),
    ]
    # Teacher layers whose weights initialise consecutive student layers.
    teacher_layers = [0, 2, 4, 7, 9, 11]
    for student_idx, teacher_idx in enumerate(teacher_layers):
        for w in ['weight', 'bias']:
            for student_name, teacher_name in sub_module_map:
                compressed_sd[f'distilbert.transformer.layer.{student_idx}.{student_name}.{w}'] = \
                    state_dict[f'bert.encoder.layer.{teacher_idx}.{teacher_name}.{w}']
    # Number of student layers that were filled.
    std_idx = len(teacher_layers)

    # Masked-LM head.
    compressed_sd['vocab_projector.weight'] = state_dict['cls.predictions.decoder.weight']
    compressed_sd['vocab_projector.bias'] = state_dict['cls.predictions.bias']
    if args.vocab_transform:
        for w in ['weight', 'bias']:
            compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
            compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']

    print(f'N layers selected for distillation: {std_idx}')
    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')

    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
    torch.save(compressed_sd, args.dump_checkpoint)
|
||||
47
examples/distillation/scripts/token_counts.py
Normal file
47
examples/distillation/scripts/token_counts.py
Normal file
@@ -0,0 +1,47 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Preprocessing script before training DistilBERT.
|
||||
"""
|
||||
from collections import Counter
|
||||
import argparse
|
||||
import pickle
|
||||
|
||||
from examples.distillation.utils import logger
|
||||
|
||||
if __name__ == '__main__':
    # Count how often each token id occurs in the binarized dataset and dump
    # the counts as a list indexed by token id.
    parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
    parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle",
                        help="The binarized dataset.")
    parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle",
                        help="The dump file.")
    parser.add_argument("--vocab_size", default=30522, type=int)
    args = parser.parse_args()

    logger.info(f'Loading data from {args.data_file}')
    with open(args.data_file, 'rb') as fp:
        data = pickle.load(fp)

    logger.info('Counting occurences for MLM.')
    counter = Counter()
    for sequence in data:
        counter.update(sequence)

    # Dense list of length vocab_size; ids never seen keep a count of 0.
    counts = [0 for _ in range(args.vocab_size)]
    for token_id, n_occurrences in counter.items():
        counts[token_id] = n_occurrences

    logger.info(f'Dump to {args.token_counts_dump}')
    with open(args.token_counts_dump, 'wb') as handle:
        pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
236
examples/distillation/train.py
Normal file
236
examples/distillation/train.py
Normal file
@@ -0,0 +1,236 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Training DistilBERT.
|
||||
"""
|
||||
import os
|
||||
import argparse
|
||||
import pickle
|
||||
import json
|
||||
import shutil
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from pytorch_transformers import BertTokenizer, BertForMaskedLM
|
||||
from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig
|
||||
|
||||
from distiller import Distiller
|
||||
from utils import git_log, logger, init_gpu_params, set_seed
|
||||
from dataset import Dataset
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point for training DistilBERT by distillation.

    Steps, in order: parse the arguments, initialise GPU/distributed settings
    and seeds, prepare the dump directory (master process only), load the
    tokenizer's special token ids, load the binarized data and token counts,
    build the student and the teacher, then hand everything to `Distiller`
    and train.

    Raises:
        ValueError: if `--dump_path` already exists and `--force` is not set.
    """
    parser = argparse.ArgumentParser(description="Training")

    parser.add_argument("--dump_path", type=str, required=True,
                        help="The output directory (log, checkpoints, parameters, etc.)")
    parser.add_argument("--data_file", type=str, required=True,
                        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
    parser.add_argument("--token_counts", type=str, required=True,
                        help="The token counts in the data_file for MLM.")
    parser.add_argument("--force", action='store_true',
                        help="Overwrite dump_path if it already exists.")

    # NOTE: the `store_false` flags below default to True; passing the flag
    # on the command line switches the option OFF.
    parser.add_argument("--vocab_size", default=30522, type=int,
                        help="The vocabulary size.")
    parser.add_argument("--max_position_embeddings", default=512, type=int,
                        help="Maximum sequence length we can model (including [CLS] and [SEP]).")
    parser.add_argument("--sinusoidal_pos_embds", action='store_false',
                        help="If true, the position embeddings are simply fixed with sinusoidal embeddings. "
                             "Default is true; passing this flag sets it to false.")
    parser.add_argument("--n_layers", default=6, type=int,
                        help="Number of Transformer blocks.")
    parser.add_argument("--n_heads", default=12, type=int,
                        help="Number of heads in the self-attention module.")
    parser.add_argument("--dim", default=768, type=int,
                        help="Dimension through the network. Must be divisible by n_heads")
    parser.add_argument("--hidden_dim", default=3072, type=int,
                        help="Intermediate dimension in the FFN.")
    parser.add_argument("--dropout", default=0.1, type=float,
                        help="Dropout.")
    parser.add_argument("--attention_dropout", default=0.1, type=float,
                        help="Dropout in self-attention.")
    parser.add_argument("--activation", default='gelu', type=str,
                        help="Activation to use in self-attention")
    parser.add_argument("--tie_weights_", action='store_false',
                        help="If true, we tie the embeddings matrix with the projection over the vocabulary matrix. "
                             "Default is true; passing this flag sets it to false.")

    parser.add_argument("--from_pretrained_weights", default=None, type=str,
                        help="Load student initialization checkpoint.")
    parser.add_argument("--from_pretrained_config", default=None, type=str,
                        help="Load student initialization architecture config.")
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
                        help="The teacher BERT model.")

    parser.add_argument("--temperature", default=2., type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument("--alpha_ce", default=0.5, type=float,
                        help="Linear weight for the distillation loss. Must be >=0.")
    parser.add_argument("--alpha_mlm", default=0.5, type=float,
                        help="Linear weight for the MLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse", default=0.0, type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
                        help="Proportion of tokens for which we need to make a prediction.")
    parser.add_argument("--word_mask", default=0.8, type=float,
                        help="Proportion of tokens to mask out.")
    parser.add_argument("--word_keep", default=0.1, type=float,
                        help="Proportion of tokens to keep.")
    parser.add_argument("--word_rand", default=0.1, type=float,
                        help="Proportion of tokens to randomly replace.")
    parser.add_argument("--mlm_smoothing", default=0.7, type=float,
                        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
    parser.add_argument("--restrict_ce_to_mask", action='store_true',
                        help="If true, compute the distillation loss only the [MLM] prediction distribution.")

    parser.add_argument("--n_epoch", type=int, default=3,
                        help="Number of pass on the whole dataset.")
    parser.add_argument("--batch_size", type=int, default=5,
                        help="Batch size (for each process).")
    parser.add_argument("--tokens_per_batch", type=int, default=-1,
                        help="If specified, modify the batches so that they have approximately this number of tokens.")
    parser.add_argument("--shuffle", action='store_false',
                        help="If true, shuffle the sequence order. "
                             "Default is true; passing this flag sets it to false.")
    parser.add_argument("--group_by_size", action='store_false',
                        help="If true, group sequences that have similar length into the same batch. "
                             "Default is true; passing this flag sets it to false.")

    parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
                        help="Gradient accumulation for larger training batches.")
    parser.add_argument("--warmup_prop", default=0.05, type=float,
                        help="Linear warmup proportion.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--learning_rate", default=5e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=5.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--initializer_range", default=0.02, type=float,
                        help="Random initialization range.")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--n_gpu", type=int, default=1,
                        help="Number of GPUs in the node.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Distributed training - Local rank")
    parser.add_argument("--seed", type=int, default=56,
                        help="Random seed")

    parser.add_argument("--log_interval", type=int, default=500,
                        help="Tensorboard logging interval.")
    parser.add_argument("--checkpoint_interval", type=int, default=4000,
                        help="Checkpoint interval.")
    args = parser.parse_args()

    ## ARGS ##
    init_gpu_params(args)
    set_seed(args)
    # Only the master process touches the dump directory.
    if args.is_master:
        if os.path.exists(args.dump_path):
            if not args.force:
                raise ValueError(f'Serialization dir {args.dump_path} already exists, but you have not specified whether to overwrite it. '
                                 'Use `--force` if you want to overwrite it.')
            else:
                shutil.rmtree(args.dump_path)

        if not os.path.exists(args.dump_path):
            os.makedirs(args.dump_path)
        logger.info(f'Experiment will be dumped and logged in {args.dump_path}')

        ### SAVE PARAMS ###
        logger.info(f'Param: {args}')
        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        git_log(args.dump_path)
    # Pre-trained weights and config must be given together, or not at all.
    assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \
           (args.from_pretrained_weights is not None and args.from_pretrained_config is not None)

    ### TOKENIZER ###
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    special_tok_ids = {}
    for tok_name, tok_symbol in bert_tokenizer.special_tokens_map.items():
        idx = bert_tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = bert_tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids

    ## DATA LOADER ##
    logger.info(f'Loading data from {args.data_file}')
    with open(args.data_file, 'rb') as fp:
        data = pickle.load(fp)

    assert os.path.isfile(args.token_counts)
    logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
    with open(args.token_counts, 'rb') as fp:
        counts = pickle.load(fp)
        assert len(counts) == args.vocab_size
    # Smoothed inverse frequency; counts are clamped to 1 so unseen tokens do
    # not raise zero to a negative power.
    token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
    for idx in special_tok_ids.values():
        token_probs[idx] = 0.  # do not predict special tokens
    token_probs = torch.from_numpy(token_probs)

    train_dataloader = Dataset(params=args, data=data)
    logger.info('Data loader created.')

    ## STUDENT ##
    if args.from_pretrained_weights is not None:
        assert os.path.isfile(os.path.join(args.from_pretrained_weights))
        assert os.path.isfile(os.path.join(args.from_pretrained_config))
        logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
        logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
        stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
        student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
                                                        config=stu_architecture_config)
    else:
        # The config is built directly from the parsed arguments.
        args.vocab_size_or_config_json_file = args.vocab_size
        stu_architecture_config = DistilBertConfig(**vars(args))
        student = DistilBertForMaskedLM(stu_architecture_config)

    if args.n_gpu > 0:
        student.to(f'cuda:{args.local_rank}')
    logger.info('Student loaded.')

    ## TEACHER ##
    teacher = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
    logger.info(f'Teacher loaded from {args.bert_model}.')

    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
                          dataloader=train_dataloader,
                          token_probs=token_probs,
                          student=student,
                          teacher=teacher)
    distiller.train()
    logger.info("Let's go get some drinks.")


if __name__ == "__main__":
    main()
|
||||
129
examples/distillation/utils.py
Normal file
129
examples/distillation/utils.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Utils to train DistilBERT
|
||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||
"""
|
||||
import git
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
import logging
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def git_log(folder_path: str):
    """
    Dump the current git state to `<folder_path>/git_log.json`.

    The repository is looked up from the working directory upwards, and its
    string representation, HEAD commit sha and active branch are recorded.
    """
    repo = git.Repo(search_parent_directories=True)
    repo_infos = dict(
        repo_id=str(repo),
        repo_sha=str(repo.head.object.hexsha),
        repo_branch=str(repo.active_branch),
    )

    log_path = os.path.join(folder_path, 'git_log.json')
    with open(log_path, 'w') as f:
        json.dump(repo_infos, f, indent=4)
|
||||
|
||||
|
||||
def init_gpu_params(params):
    """
    Fill `params` with the process topology (CPU, single GPU or multi-GPU / multi-node).

    With `n_gpu <= 0` only the CPU defaults are set and the function returns.
    With more than one GPU, the topology is read from the `WORLD_SIZE`,
    `N_GPU_NODE`, `RANK`, `N_NODES` and `NODE_RANK` environment variables.
    Otherwise a single-GPU local job is configured. The CUDA device is then
    selected and, for multi-GPU runs, the process group is initialised.
    """
    # CPU-only run: nothing to initialise.
    if params.n_gpu <= 0:
        params.local_rank = 0
        params.master_port = -1
        params.is_master = True
        params.multi_gpu = False
        return

    assert torch.cuda.is_available()

    logger.info('Initializing GPUs')
    if params.n_gpu > 1:
        # distributed job: the launcher must have provided a local rank
        assert params.local_rank != -1

        env = os.environ
        params.world_size = int(env['WORLD_SIZE'])
        params.n_gpu_per_node = int(env['N_GPU_NODE'])
        params.global_rank = int(env['RANK'])

        # number of nodes / node ID
        params.n_nodes = params.world_size // params.n_gpu_per_node
        params.node_id = params.global_rank // params.n_gpu_per_node
        params.multi_gpu = True

        assert params.n_nodes == int(env['N_NODES'])
        assert params.node_id == int(env['NODE_RANK'])
    else:
        # local job (single GPU)
        assert params.local_rank == -1

        params.n_nodes = 1
        params.node_id = 0
        params.local_rank = 0
        params.global_rank = 0
        params.world_size = 1
        params.n_gpu_per_node = 1
        params.multi_gpu = False

    # sanity checks
    assert params.n_nodes >= 1
    assert 0 <= params.node_id < params.n_nodes
    assert 0 <= params.local_rank <= params.global_rank < params.world_size
    assert params.world_size == params.n_nodes * params.n_gpu_per_node

    # define whether this is the master process / if we are in multi-node distributed mode
    params.is_master = params.node_id == 0 and params.local_rank == 0
    params.multi_node = params.n_nodes > 1

    # summary
    PREFIX = f"--- Global rank: {params.global_rank} - "
    summary = [
        ("Number of nodes: %i", params.n_nodes),
        ("Node ID : %i", params.node_id),
        ("Local rank : %i", params.local_rank),
        ("World size : %i", params.world_size),
        ("GPUs per node : %i", params.n_gpu_per_node),
        ("Master : %s", str(params.is_master)),
        ("Multi-node : %s", str(params.multi_node)),
        ("Multi-GPU : %s", str(params.multi_gpu)),
        ("Hostname : %s", socket.gethostname()),
    ]
    for template, value in summary:
        logger.info(PREFIX + template % value)

    # set GPU device
    torch.cuda.set_device(params.local_rank)

    # initialize multi-GPU
    if params.multi_gpu:
        logger.info("Initializing PyTorch distributed")
        torch.distributed.init_process_group(
            init_method='env://',
            backend='nccl',
        )
|
||||
|
||||
|
||||
def set_seed(args):
    """
    Seed the numpy and torch random number generators with `args.seed`.

    When `args.n_gpu` is positive, all CUDA generators are seeded as well.
    """
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
|
||||
@@ -235,8 +235,9 @@ def main():
|
||||
|
||||
# Prepare model
|
||||
model = BertForPreTraining.from_pretrained(args.bert_model)
|
||||
if args.fp16:
|
||||
model.half()
|
||||
# We don't need to manually call model.half() following Apex's recommend
|
||||
# if args.fp16:
|
||||
# model.half()
|
||||
model.to(device)
|
||||
if args.local_rank != -1:
|
||||
try:
|
||||
@@ -257,25 +258,36 @@ def main():
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex.optimizers import FP16_Optimizer
|
||||
from apex.optimizers import FusedAdam
|
||||
# from apex.optimizers import FP16_Optimizer
|
||||
# from apex.optimizers import FusedAdam
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
|
||||
optimizer = FusedAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
bias_correction=False,
|
||||
max_grad_norm=1.0)
|
||||
if args.loss_scale == 0:
|
||||
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||
else:
|
||||
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
|
||||
else:
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
|
||||
# The line below is the main change for the new Apex fp16 implementation. I chose opt_level="O1"
|
||||
# because it's recommended for typical use by Apex. We can make it configured
|
||||
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
|
||||
|
||||
# We don't need to use FP16_Optimizer wrapping over FusedAdam as well. Now Apex supports all Pytorch Optimizer
|
||||
|
||||
# optimizer = FusedAdam(optimizer_grouped_parameters,
|
||||
# lr=args.learning_rate,
|
||||
# bias_correction=False,
|
||||
# max_grad_norm=1.0)
|
||||
# if args.loss_scale == 0:
|
||||
# optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||
# else:
|
||||
# optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
|
||||
# else:
|
||||
# optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
|
||||
|
||||
global_step = 0
|
||||
logging.info("***** Running training *****")
|
||||
@@ -304,7 +316,10 @@ def main():
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
if args.fp16:
|
||||
optimizer.backward(loss)
|
||||
# FP16_Optimizer's backward function is deprecated; use amp.scale_loss instead, as described in the Apex documentation
|
||||
# optimizer.backward(loss)
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
else:
|
||||
loss.backward()
|
||||
tr_loss += loss.item()
|
||||
@@ -314,15 +329,16 @@ def main():
|
||||
mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
|
||||
pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
scheduler.step() # Update learning rate schedule
|
||||
optimizer.step()
|
||||
scheduler.step() # Update learning rate schedule
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Save a trained model
|
||||
if n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1 :
|
||||
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
|
||||
logging.info("** ** * Saving fine-tuned model ** ** * ")
|
||||
model.save_pretrained(args.output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
|
||||
|
||||
@@ -507,7 +507,7 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
if not os.path.exists(args.output_dir) and ( n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1 ):
|
||||
if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
@@ -602,15 +602,16 @@ def main():
|
||||
nb_tr_examples += input_ids.size(0)
|
||||
nb_tr_steps += 1
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
scheduler.step() # Update learning rate schedule
|
||||
optimizer.step()
|
||||
scheduler.step() # Update learning rate schedule
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Save a trained model
|
||||
if args.do_train and ( n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1):
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
logger.info("** ** * Saving fine - tuned model ** ** * ")
|
||||
model.save_pretrained(args.output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
|
||||
|
||||
@@ -211,10 +211,12 @@ def prune_heads(args, model, eval_dataloader, head_mask):
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
## Required parameters
|
||||
parser.add_argument("--data_dir", default=None, type=str, required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
|
||||
parser.add_argument("--model_name", default=None, type=str, required=True,
|
||||
help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
|
||||
ALL_MODELS))
|
||||
parser.add_argument("--task_name", default=None, type=str, required=True,
|
||||
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
@@ -222,9 +224,9 @@ def main():
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name")
|
||||
help="Pretrained config name or path if not the same as model_name_or_path")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||
help="Pretrained tokenizer name or path if not the same as model_name_or_path")
|
||||
parser.add_argument("--cache_dir", default="", type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
parser.add_argument("--data_subset", type=int, default=-1,
|
||||
@@ -297,15 +299,15 @@ def main():
|
||||
|
||||
args.model_type = ""
|
||||
for key in MODEL_CLASSES:
|
||||
if key in args.model_name.lower():
|
||||
if key in args.model_name_or_path.lower():
|
||||
args.model_type = key # take the first match in model types
|
||||
break
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name,
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels, finetuning_task=args.task_name,
|
||||
output_attentions=True)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name)
|
||||
model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path)
|
||||
model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
|
||||
@@ -128,7 +128,7 @@ def train(args, train_dataset, model, tokenizer):
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
|
||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids
|
||||
'labels': batch[3]}
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
||||
@@ -251,7 +251,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
|
||||
|
||||
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
||||
if args.local_rank not in [-1, 0]:
|
||||
if args.local_rank not in [-1, 0] and not evaluate:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
processor = processors[task]()
|
||||
@@ -279,14 +279,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
||||
sep_token=tokenizer.sep_token,
|
||||
sep_token_extra=bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
||||
pad_token=tokenizer.encoder[tokenizer.pad_token] if args.model_type in ['roberta'] else tokenizer.vocab[tokenizer.pad_token],
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
|
||||
)
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
|
||||
if args.local_rank == 0:
|
||||
if args.local_rank == 0 and not evaluate:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Convert to Tensors and build dataset
|
||||
@@ -467,13 +467,14 @@ def main():
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = model_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||
model.to(args.device)
|
||||
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
|
||||
497
examples/run_lm_finetuning.py
Normal file
497
examples/run_lm_finetuning.py
Normal file
@@ -0,0 +1,497 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa).
|
||||
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
|
||||
using a masked language modeling (MLM) loss.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tensorboardX import SummaryWriter
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
|
||||
BertConfig, BertForMaskedLM, BertTokenizer,
|
||||
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
||||
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
||||
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
|
||||
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Maps each supported model-type key to its (config class, LM-head model class,
# tokenizer class) triple; looked up with args.model_type in main().
MODEL_CLASSES = {
    # Causal language models
    'gpt2': (
        GPT2Config,
        GPT2LMHeadModel,
        GPT2Tokenizer,
    ),
    'openai-gpt': (
        OpenAIGPTConfig,
        OpenAIGPTLMHeadModel,
        OpenAIGPTTokenizer,
    ),
    # Masked language models
    'bert': (
        BertConfig,
        BertForMaskedLM,
        BertTokenizer,
    ),
    'roberta': (
        RobertaConfig,
        RobertaForMaskedLM,
        RobertaTokenizer,
    ),
}
|
||||
|
||||
|
||||
class TextDataset(Dataset):
    """Fixed-size blocks of token ids built from a single text file.

    The file is read whole, tokenized, converted to ids and cut into consecutive
    chunks of ``block_size`` ids. Each chunk is passed through
    ``tokenizer.add_special_tokens_single_sentence`` and stored as one example.
    The list of examples is pickled next to the input file as
    ``cached_lm_<block_size>_<filename>`` and reloaded from there on later runs.

    Args:
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``add_special_tokens_single_sentence``.
        file_path: path of an existing UTF-8 text file.
        block_size: number of token ids per example, before special tokens are added.

    Raises:
        AssertionError: if ``file_path`` is not an existing file.
        ValueError: if features must be built and ``block_size`` is not positive.
    """

    def __init__(self, tokenizer, file_path='train', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        # The cache name depends only on the block size and the file name, so a cache
        # written with a different tokenizer is picked up unchanged.
        cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')

        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE(review): pickle.load can execute arbitrary code; this is only safe
            # while the dataset directory is trusted.
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            if block_size <= 0:
                # The previous while-loop never terminated for a non-positive block size.
                raise ValueError("block_size must be a positive integer, got {}".format(block_size))

            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            # Truncate in blocks of block_size. Striding over the list avoids re-copying
            # the whole remaining tail for every block.
            # Note that we are losing the last truncated example here for the sake of
            # simplicity (no padding). If your dataset is small, first you should look
            # for a bigger one :-) and second you can change this behavior by adding
            # (model specific) padding.
            last_start = len(tokenized_text) - block_size
            self.examples = [
                tokenizer.add_special_tokens_single_sentence(tokenized_text[start:start + block_size])
                for start in range(0, last_start + 1, block_size)
            ]

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return example ``item`` as a tensor of token ids."""
        return torch.tensor(self.examples[item])
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Build the TextDataset for the evaluation file or the training file.

    Reads ``args.eval_data_file`` when ``evaluate`` is true and
    ``args.train_data_file`` otherwise, using ``args.block_size`` as block size.
    """
    if evaluate:
        source_file = args.eval_data_file
    else:
        source_file = args.train_data_file
    return TextDataset(tokenizer, file_path=source_file, block_size=args.block_size)
|
||||
|
||||
|
||||
def set_seed(args):
    """Seed Python's, NumPy's and PyTorch's random generators with ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    # Same seeding order as before: random, numpy, torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
def mask_tokens(inputs, tokenizer, args):
    """Prepare masked-LM inputs and labels: 80% MASK, 10% random, 10% original.

    Each position is selected with probability ``args.mlm_probability``.
    ``inputs`` is modified in place and also returned; ``labels`` is a copy of
    the original ids with every non-selected position set to -1.

    Returns:
        tuple: ``(inputs, labels)``.
    """
    shape = inputs.shape
    labels = inputs.clone()

    def _coin(prob):
        # Boolean tensor of `shape`, each entry True with probability `prob`.
        return torch.bernoulli(torch.full(shape, prob)).bool()

    # Positions taking part in the masked-LM loss; all others get label -1.
    selected = _coin(args.mlm_probability)
    labels.masked_fill_(~selected, -1)

    # 80% of the selected positions are overwritten with the mask token id.
    to_mask = _coin(0.8) & selected
    inputs[to_mask] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of the remaining selected positions (10% overall) get a random id.
    to_randomize = _coin(0.5) & selected & ~to_mask
    replacement_ids = torch.randint(len(tokenizer), shape, dtype=torch.long)
    inputs[to_randomize] = replacement_ids[to_randomize]

    # The last 10% of the selected positions keep their original id.
    return inputs, labels
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
    """Train the model.

    Builds the dataloader, an AdamW optimizer with a warmup-linear schedule and,
    if requested, the apex fp16 / DataParallel / DistributedDataParallel wrappers,
    then runs the training loop with gradient accumulation and gradient clipping.
    On ranks -1 and 0, metrics are written to a tensorboardX ``SummaryWriter``
    every ``args.logging_steps`` updates and a checkpoint is saved under
    ``args.output_dir/checkpoint-<step>`` every ``args.save_steps`` updates.

    Args:
        args: parsed command-line namespace; ``train_batch_size`` is set here, and
            ``num_train_epochs`` is overwritten when ``max_steps`` is positive.
        train_dataset: dataset yielding tensors of token ids.
        model: language model whose first output is the loss.
        tokenizer: tokenizer, used for masking and for in-training evaluation.

    Returns:
        tuple: ``(global_step, average_loss)`` where ``average_loss`` is the
        accumulated loss divided by the number of optimizer updates.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        if args.local_rank != -1:
            # Without this, DistributedSampler yields the same ordering every epoch.
            train_sampler.set_epoch(epoch)
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against a ZeroDivisionError when no optimizer update was performed
    # (e.g. fewer batches than gradient accumulation steps).
    return global_step, tr_loss / max(global_step, 1)
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
    """Compute the perplexity of ``model`` on the evaluation file.

    Loads the dataset for ``args.eval_data_file``, averages the model's loss over
    all batches (each batch is passed as both input and labels), and writes the
    result to ``eval_results.txt`` inside ``args.output_dir``.

    Args:
        args: parsed command-line namespace; ``eval_batch_size`` is set here.
        model: language model whose first output is the loss.
        tokenizer: tokenizer used to build the evaluation dataset.
        prefix: label inserted in the log messages (e.g. a checkpoint step).

    Returns:
        dict: ``{"perplexity": tensor}``, the exponential of the mean batch loss.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = batch.to(args.device)

        with torch.no_grad():
            outputs = model(batch, masked_lm_labels=batch) if args.mlm else model(batch, labels=batch)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {
        "perplexity": perplexity
    }

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    # Return the computed metrics; the previous code returned a separate dict
    # that was never filled, so callers always received {}.
    return result
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: fine-tune and/or evaluate a language model.

    Parses the arguments, sets up the device / distributed backend, logging and
    seeds, loads the config, tokenizer and model selected by ``--model_type``,
    then optionally trains (``--do_train``), saves the model, tokenizer and
    arguments to ``--output_dir``, and optionally evaluates (``--do_eval``).

    Returns:
        dict: evaluation results merged over all evaluated checkpoints, with each
        key suffixed by ``_<step>`` (empty step when a single checkpoint is used).

    Raises:
        ValueError: for a BERT/RoBERTa model type without ``--mlm``, for
            ``--do_eval`` without ``--eval_data_file``, or when training into a
            non-empty output directory without ``--overwrite_output_dir``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_data_file", default=None, type=str, required=True,
                        help="The input training data file (a text file).")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--eval_data_file", default=None, type=str,
                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")

    parser.add_argument("--model_type", default="bert", type=str,
                        help="The model architecture to be fine-tuned.")
    parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
                        help="The model checkpoint for weights initialization.")

    parser.add_argument("--mlm", action='store_true',
                        help="Train with masked-language modeling loss instead of language modeling.")
    parser.add_argument("--mlm_probability", type=float, default=0.15,
                        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument("--config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization."
                             "The training dataset will be truncated in block of this size for training."
                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=1.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()

    if args.model_type in ["bert", "roberta"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                         "or remove the --do_eval argument.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # With --no_cuda no GPU may be used: report 0 so train() does not wrap the
        # CPU model in DataParallel on a multi-GPU host.
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
    if args.block_size <= 0:
        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The step suffix is only used when several checkpoints are evaluated.
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results


if __name__ == "__main__":
    main()
|
||||
@@ -157,8 +157,8 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
tr_loss += loss.item()
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
scheduler.step() # Update learning rate schedule
|
||||
optimizer.step()
|
||||
scheduler.step() # Update learning rate schedule
|
||||
model.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
@@ -272,7 +272,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||
if args.local_rank not in [-1, 0]:
|
||||
if args.local_rank not in [-1, 0] and not evaluate:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
@@ -299,7 +299,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
|
||||
if args.local_rank == 0:
|
||||
if args.local_rank == 0 and not evaluate:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Convert to Tensors and build dataset
|
||||
@@ -481,7 +481,7 @@ def main():
|
||||
|
||||
|
||||
# Save the trained model and the tokenizer
|
||||
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
# Create output directory if needed
|
||||
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||
os.makedirs(args.output_dir)
|
||||
@@ -498,7 +498,7 @@ def main():
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = model_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||
model.to(args.device)
|
||||
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ class ExamplesTests(unittest.TestCase):
|
||||
"--do_train",
|
||||
"--do_eval",
|
||||
"--version_2_with_negative",
|
||||
"--learning_rate=1e-4",
|
||||
"--learning_rate=2e-4",
|
||||
"--per_gpu_train_batch_size=2",
|
||||
"--per_gpu_eval_batch_size=1",
|
||||
"--overwrite_output_dir",
|
||||
|
||||
@@ -422,12 +422,14 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
||||
tokens_b = tokenizer.tokenize(example.text_b)
|
||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
||||
# length is less than the specified length.
|
||||
# Account for [CLS], [SEP], [SEP] with "- 3"
|
||||
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
|
||||
# Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
|
||||
special_tokens_count = 4 if sep_token_extra else 3
|
||||
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
|
||||
else:
|
||||
# Account for [CLS] and [SEP] with "- 2"
|
||||
if len(tokens_a) > max_seq_length - 2:
|
||||
tokens_a = tokens_a[:(max_seq_length - 2)]
|
||||
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
|
||||
special_tokens_count = 3 if sep_token_extra else 2
|
||||
if len(tokens_a) > max_seq_length - special_tokens_count:
|
||||
tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
|
||||
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
|
||||
140
hubconf.py
140
hubconf.py
@@ -1,30 +1,112 @@
|
||||
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']
|
||||
from pytorch_transformers import (
|
||||
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
|
||||
)
|
||||
from pytorch_transformers.modeling_utils import add_start_docstrings
|
||||
|
||||
from hubconfs.bert_hubconf import (
|
||||
bertTokenizer,
|
||||
bertModel,
|
||||
bertForNextSentencePrediction,
|
||||
bertForPreTraining,
|
||||
bertForMaskedLM,
|
||||
bertForSequenceClassification,
|
||||
bertForMultipleChoice,
|
||||
bertForQuestionAnswering,
|
||||
bertForTokenClassification
|
||||
)
|
||||
from hubconfs.gpt_hubconf import (
|
||||
openAIGPTTokenizer,
|
||||
openAIGPTModel,
|
||||
openAIGPTLMHeadModel,
|
||||
openAIGPTDoubleHeadsModel
|
||||
)
|
||||
from hubconfs.gpt2_hubconf import (
|
||||
gpt2Tokenizer,
|
||||
gpt2Model,
|
||||
gpt2LMHeadModel,
|
||||
gpt2DoubleHeadsModel
|
||||
)
|
||||
from hubconfs.transformer_xl_hubconf import (
|
||||
transformerXLTokenizer,
|
||||
transformerXLModel,
|
||||
transformerXLLMHeadModel
|
||||
)
|
||||
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
|
||||
|
||||
@add_start_docstrings(AutoConfig.__doc__)
|
||||
def config(*args, **kwargs):
|
||||
r"""
|
||||
# Using torch.hub !
|
||||
import torch
|
||||
|
||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.
|
||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
|
||||
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
|
||||
assert config.output_attention == True
|
||||
config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
|
||||
assert config.output_attention == True
|
||||
assert unused_kwargs == {'foo': False}
|
||||
|
||||
"""
|
||||
|
||||
return AutoConfig.from_pretrained(*args, **kwargs)
|
||||
|
||||
|
||||
@add_start_docstrings(AutoTokenizer.__doc__)
|
||||
def tokenizer(*args, **kwargs):
|
||||
r"""
|
||||
# Using torch.hub !
|
||||
import torch
|
||||
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache.
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
||||
|
||||
"""
|
||||
|
||||
return AutoTokenizer.from_pretrained(*args, **kwargs)
|
||||
|
||||
|
||||
@add_start_docstrings(AutoModel.__doc__)
|
||||
def model(*args, **kwargs):
|
||||
r"""
|
||||
# Using torch.hub !
|
||||
import torch
|
||||
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||
assert model.config.output_attention == True
|
||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
|
||||
return AutoModel.from_pretrained(*args, **kwargs)
|
||||
|
||||
@add_start_docstrings(AutoModelWithLMHead.__doc__)
|
||||
def modelWithLMHead(*args, **kwargs):
|
||||
r"""
|
||||
# Using torch.hub !
|
||||
import torch
|
||||
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||
assert model.config.output_attention == True
|
||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
|
||||
|
||||
|
||||
@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
|
||||
def modelForSequenceClassification(*args, **kwargs):
|
||||
r"""
|
||||
# Using torch.hub !
|
||||
import torch
|
||||
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||
assert model.config.output_attention == True
|
||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
|
||||
return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
|
||||
|
||||
|
||||
@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
|
||||
def modelForQuestionAnswering(*args, **kwargs):
|
||||
r"""
|
||||
# Using torch.hub !
|
||||
import torch
|
||||
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||
assert model.config.output_attention == True
|
||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
|
||||
|
||||
@@ -1,360 +0,0 @@
|
||||
from pytorch_transformers.tokenization_bert import BertTokenizer
|
||||
from pytorch_transformers.modeling_bert import (
|
||||
BertModel,
|
||||
BertForNextSentencePrediction,
|
||||
BertForMaskedLM,
|
||||
BertForMultipleChoice,
|
||||
BertForPreTraining,
|
||||
BertForQuestionAnswering,
|
||||
BertForSequenceClassification,
|
||||
BertForTokenClassification,
|
||||
)
|
||||
|
||||
# A lot of models share the same param doc. Use a decorator
|
||||
# to save typing
|
||||
bert_docstring = """
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load
|
||||
. `bert-base-uncased`
|
||||
. `bert-large-uncased`
|
||||
. `bert-base-cased`
|
||||
. `bert-large-cased`
|
||||
. `bert-base-multilingual-uncased`
|
||||
. `bert-base-multilingual-cased`
|
||||
. `bert-base-chinese`
|
||||
. `bert-base-german-cased`
|
||||
. `bert-large-uncased-whole-word-masking`
|
||||
. `bert-large-cased-whole-word-masking`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `bert_config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
|
||||
instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `bert_config.json` a configuration file for the model
|
||||
. `model.chkpt` a TensorFlow checkpoint
|
||||
from_tf: should we load the weights from a locally saved TensorFlow
|
||||
checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models
|
||||
will be cached.
|
||||
state_dict: an optional state dictionary
|
||||
(collections.OrderedDict object) to use instead of Google
|
||||
pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific Bert class
|
||||
(ex: num_labels for BertForSequenceClassification)
|
||||
"""
|
||||
|
||||
|
||||
def _append_from_pretrained_docstring(docstr):
|
||||
def docstring_decorator(fn):
|
||||
fn.__doc__ = fn.__doc__ + docstr
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
def bertTokenizer(*args, **kwargs):
|
||||
"""
|
||||
Instantiate a BertTokenizer from a pre-trained/customized vocab file
|
||||
Args:
|
||||
pretrained_model_name_or_path: Path to pretrained model archive
|
||||
or one of pre-trained vocab configs below.
|
||||
* bert-base-uncased
|
||||
* bert-large-uncased
|
||||
* bert-base-cased
|
||||
* bert-large-cased
|
||||
* bert-base-multilingual-uncased
|
||||
* bert-base-multilingual-cased
|
||||
* bert-base-chinese
|
||||
Keyword args:
|
||||
cache_dir: an optional path to a specific directory to download and cache
|
||||
the pre-trained model weights.
|
||||
Default: None
|
||||
do_lower_case: Whether to lower case the input.
|
||||
Only has an effect when do_wordpiece_only=False
|
||||
Default: True
|
||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
|
||||
Default: True
|
||||
max_len: An artificial maximum length to truncate tokenized sequences to;
|
||||
Effective maximum length is always the minimum of this
|
||||
value (if specified) and the underlying BERT model's
|
||||
sequence length.
|
||||
Default: None
|
||||
never_split: List of tokens which will never be split during tokenization.
|
||||
Only has an effect when do_wordpiece_only=False
|
||||
Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
|
||||
|
||||
Example:
|
||||
import torch
|
||||
sentence = 'Hello, World!'
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
toks = tokenizer.tokenize(sentence)
|
||||
['Hello', '##,', 'World', '##!']
|
||||
ids = tokenizer.convert_tokens_to_ids(toks)
|
||||
[8667, 28136, 1291, 28125]
|
||||
"""
|
||||
tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(bert_docstring)
|
||||
def bertModel(*args, **kwargs):
|
||||
"""
|
||||
BertModel is the basic BERT Transformer model with a layer of summed token,
|
||||
position and sequence embeddings followed by a series of identical
|
||||
self-attention blocks (12 for BERT-base, 24 for BERT-large).
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
# Prepare tokenized input
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
# Load bertModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
|
||||
model.eval()
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
encoded_layers, _ = model(tokens_tensor, segments_tensors)
|
||||
"""
|
||||
model = BertModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(bert_docstring)
|
||||
def bertForNextSentencePrediction(*args, **kwargs):
|
||||
"""
|
||||
BERT model with next sentence prediction head.
|
||||
This module comprises the BERT model followed by the next sentence
|
||||
classification head.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
# Prepare tokenized input
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
# Load bertForNextSentencePrediction
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
|
||||
model.eval()
|
||||
# Predict the next sentence classification logits
|
||||
with torch.no_grad():
|
||||
next_sent_classif_logits = model(tokens_tensor, segments_tensors)
|
||||
"""
|
||||
model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(bert_docstring)
|
||||
def bertForPreTraining(*args, **kwargs):
|
||||
"""
|
||||
BERT model with pre-training heads.
|
||||
This module comprises the BERT model followed by the two pre-training heads
|
||||
- the masked language modeling head, and
|
||||
- the next sentence classification head.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
# Prepare tokenized input
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
# Load bertForPreTraining
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
|
||||
masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
|
||||
"""
|
||||
model = BertForPreTraining.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(bert_docstring)
|
||||
def bertForMaskedLM(*args, **kwargs):
|
||||
"""
|
||||
BertForMaskedLM includes the BertModel Transformer followed by the
|
||||
(possibly) pre-trained masked language modeling head.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
# Prepare tokenized input
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = '[MASK]'
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
# Load bertForMaskedLM
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
|
||||
model.eval()
|
||||
# Predict all tokens
|
||||
with torch.no_grad():
|
||||
predictions = model(tokens_tensor, segments_tensors)
|
||||
predicted_index = torch.argmax(predictions[0, masked_index]).item()
|
||||
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
|
||||
'henson'
|
||||
"""
|
||||
model = BertForMaskedLM.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(bert_docstring)
|
||||
def bertForSequenceClassification(*args, **kwargs):
|
||||
"""
|
||||
BertForSequenceClassification is a fine-tuning model that includes
|
||||
BertModel and a sequence-level (sequence or pair of sequences) classifier
|
||||
on top of the BertModel. Note that the classification head is only initialized
|
||||
and has to be trained.
|
||||
|
||||
The sequence-level classifier is a linear layer that takes as input the
|
||||
last hidden state of the first character in the input sequence
|
||||
(see Figures 3a and 3b in the BERT paper).
|
||||
|
||||
Args:
|
||||
num_labels: the number (>=2) of classes for the classifier.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
# Prepare tokenized input
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
# Load bertForSequenceClassification
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
|
||||
model.eval()
|
||||
# Predict the sequence classification logits
|
||||
with torch.no_grad():
|
||||
seq_classif_logits = model(tokens_tensor, segments_tensors)
|
||||
# Or get the sequence classification loss
|
||||
labels = torch.tensor([1])
|
||||
seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
|
||||
"""
|
||||
model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(bert_docstring)
|
||||
def bertForMultipleChoice(*args, **kwargs):
|
||||
"""
|
||||
BertForMultipleChoice is a fine-tuning model that includes BertModel and a
|
||||
linear layer on top of the BertModel. Note that the multiple choice head is
|
||||
only initialized and has to be trained.
|
||||
|
||||
Args:
|
||||
num_choices: the number (>=2) of classes for the classifier.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
# Prepare tokenized input
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
|
||||
segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
|
||||
# Load bertForMultipleChoice
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
|
||||
model.eval()
|
||||
# Predict the multiple choice logits
|
||||
with torch.no_grad():
|
||||
multiple_choice_logits = model(tokens_tensor, segments_tensors)
|
||||
# Or get the multiple choice loss
|
||||
labels = torch.tensor([1])
|
||||
multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
|
||||
"""
|
||||
model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(bert_docstring)
|
||||
def bertForQuestionAnswering(*args, **kwargs):
|
||||
"""
|
||||
BertForQuestionAnswering is a fine-tuning model that includes BertModel
|
||||
with a token-level classifiers on top of the full sequence of last hidden
|
||||
states. Note that the classification head is only initialized
|
||||
and has to be trained.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
# Prepare tokenized input
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
# Load bertForQuestionAnswering
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
|
||||
model.eval()
|
||||
# Predict the start and end positions logits
|
||||
with torch.no_grad():
|
||||
start_logits, end_logits = model(tokens_tensor, segments_tensors)
|
||||
# Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
|
||||
start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
|
||||
# set model.train() before if training this loss
|
||||
multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
|
||||
"""
|
||||
model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(bert_docstring)
|
||||
def bertForTokenClassification(*args, **kwargs):
|
||||
"""
|
||||
BertForTokenClassification is a fine-tuning model that includes BertModel
|
||||
and a token-level classifier on top of the BertModel. Note that the classification
|
||||
head is only initialized and has to be trained.
|
||||
|
||||
The token-level classifier is a linear layer that takes as input the last
|
||||
hidden state of the sequence.
|
||||
|
||||
Args:
|
||||
num_labels: the number (>=2) of classes for the classifier.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
||||
# Prepare tokenized input
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
# Load bertForTokenClassification
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
|
||||
model.eval()
|
||||
# Predict the token classification logits
|
||||
with torch.no_grad():
|
||||
classif_logits = model(tokens_tensor, segments_tensors)
|
||||
# Or get the token classification loss
|
||||
labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
|
||||
classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
|
||||
"""
|
||||
model = BertForTokenClassification.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
@@ -1,168 +0,0 @@
|
||||
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
|
||||
from pytorch_transformers.modeling_gpt2 import (
|
||||
GPT2Model,
|
||||
GPT2LMHeadModel,
|
||||
GPT2DoubleHeadsModel
|
||||
)
|
||||
|
||||
# A lot of models share the same param doc. Use a decorator
|
||||
# to save typing
|
||||
gpt2_docstring = """
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `gpt2`, `gpt2-medium`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `gpt2_config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `gpt2_config.json` a configuration file for the model
|
||||
. a TensorFlow checkpoint with trained weights
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific GPT-2 class
|
||||
"""
|
||||
|
||||
|
||||
def _append_from_pretrained_docstring(docstr):
|
||||
def docstring_decorator(fn):
|
||||
fn.__doc__ = fn.__doc__ + docstr
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
def gpt2Tokenizer(*args, **kwargs):
|
||||
"""
|
||||
Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
|
||||
Peculiarities:
|
||||
- Byte-level BPE
|
||||
|
||||
Args:
|
||||
pretrained_model_name_or_path: Path to pretrained model archive
|
||||
or one of pre-trained vocab configs below.
|
||||
* gpt2
|
||||
Keyword args:
|
||||
special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
|
||||
Default: None
|
||||
max_len: An artificial maximum length to truncate tokenized sequences to;
|
||||
Effective maximum length is always the minimum of this
|
||||
value (if specified) and the underlying BERT model's
|
||||
sequence length.
|
||||
Default: None
|
||||
|
||||
Example:
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
|
||||
|
||||
text = "Who was Jim Henson ?"
|
||||
indexed_tokens = tokenizer.encode(tokenized_text)
|
||||
"""
|
||||
tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(gpt2_docstring)
|
||||
def gpt2Model(*args, **kwargs):
|
||||
"""
|
||||
gpt2Model is the basic OpenAI GPT-2 Transformer model based on
|
||||
identical stacked masked self-attention blocks and pre-trained
|
||||
on large scale dataset using language modeling signal.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
|
||||
|
||||
# Prepare tokenized input
|
||||
text_1 = "Who was Jim Henson ?"
|
||||
text_2 = "Jim Henson was a puppeteer"
|
||||
indexed_tokens_1 = tokenizer.encode(text_1)
|
||||
indexed_tokens_2 = tokenizer.encode(text_2)
|
||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load gpt2Model
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
# past can be used to reuse precomputed hidden state in a subsequent predictions
|
||||
with torch.no_grad():
|
||||
hidden_states_1, past = model(tokens_tensor_1)
|
||||
hidden_states_2, past = model(tokens_tensor_2, past=past)
|
||||
"""
|
||||
model = GPT2Model.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(gpt2_docstring)
|
||||
def gpt2LMHeadModel(*args, **kwargs):
|
||||
"""
|
||||
gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
|
||||
tied (pre-trained) language modeling head on top.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
|
||||
|
||||
# Prepare tokenized input
|
||||
text_1 = "Who was Jim Henson ?"
|
||||
text_2 = "Jim Henson was a puppeteer"
|
||||
indexed_tokens_1 = tokenizer.encode(text_1)
|
||||
indexed_tokens_2 = tokenizer.encode(text_2)
|
||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load gpt2LMHeadModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
# past can be used to reuse precomputed hidden states in subsequent predictions
|
||||
with torch.no_grad():
|
||||
predictions_1, past = model(tokens_tensor_1)
|
||||
predictions_2, past = model(tokens_tensor_2, past=past)
|
||||
|
||||
# Get the predicted last token
|
||||
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
|
||||
predicted_token = tokenizer.decode([predicted_index])
|
||||
assert predicted_token == ' who'
|
||||
"""
|
||||
model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(gpt2_docstring)
|
||||
def gpt2DoubleHeadsModel(*args, **kwargs):
|
||||
"""
|
||||
gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
|
||||
tied (pre-trained) language modeling head and a multiple choice
|
||||
classification head (only initialized, not pre-trained).
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
|
||||
|
||||
# Prepare tokenized input
|
||||
text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
||||
text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
|
||||
tokenized_text1 = tokenizer.tokenize(text1)
|
||||
tokenized_text2 = tokenizer.tokenize(text2)
|
||||
indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
|
||||
indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
|
||||
tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
|
||||
mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
|
||||
|
||||
# Load gpt2DoubleHeadsModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
|
||||
"""
|
||||
model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
@@ -1,186 +0,0 @@
|
||||
from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
|
||||
from pytorch_transformers.modeling_openai import (
|
||||
OpenAIGPTModel,
|
||||
OpenAIGPTLMHeadModel,
|
||||
OpenAIGPTDoubleHeadsModel
|
||||
)
|
||||
|
||||
# Dependencies that are not specified in global hubconf.py
|
||||
specific_dependencies = ['spacy', 'ftfy']
|
||||
|
||||
# A lot of models share the same param doc. Use a decorator
|
||||
# to save typing
|
||||
gpt_docstring = """
|
||||
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
|
||||
Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
|
||||
Special tokens need to be trained during the fine-tuning if you use them.
|
||||
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
|
||||
|
||||
The embeddings are ordered as follows in the token embeddings matrix:
|
||||
[0, ----------------------
|
||||
... -> word embeddings
|
||||
config.vocab_size - 1, ______________________
|
||||
config.vocab_size,
|
||||
... -> special embeddings
|
||||
config.vocab_size + config.n_special - 1] ______________________
|
||||
|
||||
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
You should use the associated indices to index the embeddings.
|
||||
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `openai-gpt`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `openai_gpt_config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `openai-gpt-config.json` a configuration file for the model
|
||||
. a series of NumPy files containing OpenAI TensorFlow trained weights
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object)
|
||||
to use instead of pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific OpenAI-GPT class
|
||||
"""
|
||||
|
||||
|
||||
def _append_from_pretrained_docstring(docstr):
|
||||
def docstring_decorator(fn):
|
||||
fn.__doc__ = fn.__doc__ + docstr
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
def openAIGPTTokenizer(*args, **kwargs):
|
||||
"""
|
||||
Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
|
||||
Peculiarities:
|
||||
- lower case all inputs
|
||||
- uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
|
||||
- argument special_tokens and function set_special_tokens:
|
||||
can be used to add additional symbols (ex: "__classify__") to a vocabulary.
|
||||
|
||||
Args:
|
||||
pretrained_model_name_or_path: Path to pretrained model archive
|
||||
or one of pre-trained vocab configs below.
|
||||
* openai-gpt
|
||||
Keyword args:
|
||||
special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
|
||||
Default: None
|
||||
max_len: An artificial maximum length to truncate tokenized sequences to;
|
||||
Effective maximum length is always the minimum of this
|
||||
value (if specified) and the underlying model's
|
||||
sequence length.
|
||||
Default: None
|
||||
|
||||
Example:
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
|
||||
|
||||
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
[763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
|
||||
"""
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(gpt_docstring)
|
||||
def openAIGPTModel(*args, **kwargs):
|
||||
"""
|
||||
OpenAIGPTModel is the basic OpenAI GPT Transformer model based on
|
||||
identical stacked masked self-attention blocks and pre-trained
|
||||
on large scale dataset using language modeling signal.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
|
||||
|
||||
# Prepare tokenized input
|
||||
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
|
||||
# Load openAIGPTModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
hidden_states = model(tokens_tensor)
|
||||
"""
|
||||
model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(gpt_docstring)
|
||||
def openAIGPTLMHeadModel(*args, **kwargs):
|
||||
"""
|
||||
OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
|
||||
tied (pre-trained) language modeling head on top.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
|
||||
|
||||
# Prepare tokenized input
|
||||
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
|
||||
# Load openAIGPTLMHeadModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
predictions = model(tokens_tensor)
|
||||
|
||||
# Get the predicted last token
|
||||
predicted_index = torch.argmax(predictions[0, -1, :]).item()
|
||||
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
|
||||
'.</w>'
|
||||
"""
|
||||
model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(gpt_docstring)
|
||||
def openAIGPTDoubleHeadsModel(*args, **kwargs):
|
||||
"""
|
||||
OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
|
||||
tied (pre-trained) language modeling head and a multiple choice
|
||||
classification head (only initialized, not pre-trained).
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
|
||||
|
||||
# Prepare tokenized input
|
||||
text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
||||
text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
|
||||
tokenized_text1 = tokenizer.tokenize(text1)
|
||||
tokenized_text2 = tokenizer.tokenize(text2)
|
||||
indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
|
||||
indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
|
||||
tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
|
||||
mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
|
||||
|
||||
# Load openAIGPTDoubleHeadsModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
|
||||
"""
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
@@ -1,130 +0,0 @@
|
||||
from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
|
||||
from pytorch_transformers.modeling_transfo_xl import (
|
||||
TransfoXLModel,
|
||||
TransfoXLLMHeadModel
|
||||
)
|
||||
|
||||
# A lot of models share the same param doc. Use a decorator
|
||||
# to save typing
|
||||
transformer_xl_docstring = """
|
||||
Transformer XL uses relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
|
||||
- you don't need to specify positioning embeddings indices
|
||||
- the tokens in the vocabulary have to be sorted by decreasing frequency.
|
||||
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `transfo-xl-wt103`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `transfo_xl_config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `transfo_xl_config.json` a configuration file for the model
|
||||
. `model.chkpt` a TensorFlow checkpoint
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific TransformerXL class
|
||||
"""
|
||||
|
||||
|
||||
def _append_from_pretrained_docstring(docstr):
|
||||
def docstring_decorator(fn):
|
||||
fn.__doc__ = fn.__doc__ + docstr
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
def transformerXLTokenizer(*args, **kwargs):
|
||||
"""
|
||||
Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
|
||||
|
||||
Args:
|
||||
pretrained_model_name_or_path: Path to pretrained model archive
|
||||
or one of pre-trained vocab configs below.
|
||||
* transfo-xl-wt103
|
||||
|
||||
Example:
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
|
||||
|
||||
text = "Who was Jim Henson ?"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
"""
|
||||
tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(transformer_xl_docstring)
|
||||
def transformerXLModel(*args, **kwargs):
|
||||
"""
|
||||
transformerXLModel is the basic Transformer XL model.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
|
||||
|
||||
# Prepare tokenized input
|
||||
text_1 = "Who was Jim Henson ?"
|
||||
text_2 = "Jim Henson was a puppeteer"
|
||||
tokenized_text_1 = tokenizer.tokenize(text_1)
|
||||
tokenized_text_2 = tokenizer.tokenize(text_2)
|
||||
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
|
||||
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
|
||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load transformerXLModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
# We can re-use the memory cells in a subsequent call to attend a longer context
|
||||
with torch.no_grad():
|
||||
hidden_states_1, mems_1 = model(tokens_tensor_1)
|
||||
hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
|
||||
"""
|
||||
model = TransfoXLModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(transformer_xl_docstring)
|
||||
def transformerXLLMHeadModel(*args, **kwargs):
|
||||
"""
|
||||
transformerXLLMHeadModel is the Transformer XL model with the
|
||||
tied (pre-trained) language modeling head on top.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
|
||||
|
||||
# Prepare tokenized input
|
||||
text_1 = "Who was Jim Henson ?"
|
||||
text_2 = "Jim Henson was a puppeteer"
|
||||
tokenized_text_1 = tokenizer.tokenize(text_1)
|
||||
tokenized_text_2 = tokenizer.tokenize(text_2)
|
||||
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
|
||||
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
|
||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load transformerXLLMHeadModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
# We can re-use the memory cells in a subsequent call to attend a longer context
|
||||
with torch.no_grad():
|
||||
predictions_1, mems_1 = model(tokens_tensor_1)
|
||||
predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
|
||||
|
||||
# Get the predicted last token
|
||||
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
|
||||
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
|
||||
assert predicted_token == 'who'
|
||||
"""
|
||||
model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
@@ -1,167 +0,0 @@
|
||||
from pytorch_transformers.tokenization_xlm import XLMTokenizer
|
||||
from pytorch_transformers.modeling_xlm import (
|
||||
XLMConfig,
|
||||
XLMModel,
|
||||
XLMWithLMHeadModel,
|
||||
XLMForSequenceClassification,
|
||||
XLMForQuestionAnswering
|
||||
)
|
||||
|
||||
# A lot of models share the same param doc. Use a decorator
|
||||
# to save typing
|
||||
xlm_start_docstring = """
|
||||
Model class adapted from the XLM Transformer model of
|
||||
"Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
|
||||
Paper: https://arxiv.org/abs/1901.07291
|
||||
Original code: https://github.com/facebookresearch/XLM
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
|
||||
|
||||
# Prepare tokenized input
|
||||
text_1 = "Who was Jim Henson ?"
|
||||
text_2 = "Jim Henson was a puppeteer"
|
||||
indexed_tokens_1 = tokenizer.encode(text_1)
|
||||
indexed_tokens_2 = tokenizer.encode(text_2)
|
||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
"""
|
||||
|
||||
# A lot of models share the same param doc. Use a decorator
|
||||
# to save typing
|
||||
xlm_end_docstring = """
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `xlm-mlm-en-2048`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific XLM class
|
||||
"""
|
||||
|
||||
|
||||
def _begin_with_docstring(docstr):
|
||||
def docstring_decorator(fn):
|
||||
fn.__doc__ = fn.__doc__ + docstr
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
def _end_with_docstring(docstr):
|
||||
def docstring_decorator(fn):
|
||||
fn.__doc__ = fn.__doc__ + docstr
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
def xlmTokenizer(*args, **kwargs):
|
||||
"""
|
||||
Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
|
||||
|
||||
Args:
|
||||
pretrained_model_name_or_path: Path to pretrained model archive
|
||||
or one of pre-trained vocab configs below.
|
||||
* xlm-mlm-en-2048
|
||||
Keyword args:
|
||||
special_tokens: Special tokens in vocabulary that are not pretrained
|
||||
Default: None
|
||||
max_len: An artificial maximum length to truncate tokenized sequences to;
|
||||
Effective maximum length is always the minimum of this
|
||||
value (if specified) and the underlying model's
|
||||
sequence length.
|
||||
Default: None
|
||||
|
||||
Example:
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
|
||||
|
||||
text = "Who was Jim Henson ?"
|
||||
indexed_tokens = tokenizer.encode(text)
|
||||
"""
|
||||
tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
|
||||
@_begin_with_docstring(xlm_start_docstring)
|
||||
@_end_with_docstring(xlm_end_docstring)
|
||||
def xlmModel(*args, **kwargs):
|
||||
"""
|
||||
# Load xlmModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
hidden_states_1, mems = model(tokens_tensor_1)
|
||||
hidden_states_2, mems = model(tokens_tensor_2, past=mems)
|
||||
"""
|
||||
model = XLMModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_begin_with_docstring(xlm_start_docstring)
|
||||
@_end_with_docstring(xlm_end_docstring)
|
||||
def xlmLMHeadModel(*args, **kwargs):
|
||||
"""
|
||||
# Prepare tokenized input
|
||||
text_1 = "Who was Jim Henson ?"
|
||||
text_2 = "Jim Henson was a puppeteer"
|
||||
indexed_tokens_1 = tokenizer.encode(text_1)
|
||||
indexed_tokens_2 = tokenizer.encode(text_2)
|
||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load xlmLMHeadModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'xlmLMHeadModel', 'xlm-mlm-en-2048')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
predictions_1, mems = model(tokens_tensor_1)
|
||||
predictions_2, mems = model(tokens_tensor_2, mems=mems)
|
||||
|
||||
# Get the predicted last token
|
||||
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
|
||||
predicted_token = tokenizer.decode([predicted_index])
|
||||
assert predicted_token == ' who'
|
||||
"""
|
||||
model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
# @_end_with_docstring(xlnet_docstring)
|
||||
# def xlnetForSequenceClassification(*args, **kwargs):
|
||||
# """
|
||||
# xlnetModel is the basic XLNet Transformer model from
|
||||
# "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
||||
# by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
||||
|
||||
# Example:
|
||||
# # Load the tokenizer
|
||||
# import torch
|
||||
# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
|
||||
|
||||
# # Prepare tokenized input
|
||||
# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
||||
# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
|
||||
# tokenized_text1 = tokenizer.tokenize(text1)
|
||||
# tokenized_text2 = tokenizer.tokenize(text2)
|
||||
# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
|
||||
# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
|
||||
# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
|
||||
# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
|
||||
|
||||
# # Load xlnetForSequenceClassification
|
||||
# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
|
||||
# model.eval()
|
||||
|
||||
# # Predict sequence classes logits
|
||||
# with torch.no_grad():
|
||||
# lm_logits, mems = model(tokens_tensor)
|
||||
# """
|
||||
# model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
|
||||
# return model
|
||||
@@ -1,169 +0,0 @@
|
||||
from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
|
||||
from pytorch_transformers.modeling_xlnet import (
|
||||
XLNetConfig,
|
||||
XLNetModel,
|
||||
XLNetLMHeadModel,
|
||||
# XLNetForSequenceClassification
|
||||
)
|
||||
|
||||
# A lot of models share the same param doc. Use a decorator
|
||||
# to save typing
|
||||
xlnet_docstring = """
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `xlnet-large-cased`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `xlnet_config.json` a configuration file for the model
|
||||
. `model.chkpt` a TensorFlow checkpoint
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific XLNet class
|
||||
"""
|
||||
|
||||
|
||||
def _append_from_pretrained_docstring(docstr):
|
||||
def docstring_decorator(fn):
|
||||
fn.__doc__ = fn.__doc__ + docstr
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
def xlnetTokenizer(*args, **kwargs):
|
||||
"""
|
||||
Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
|
||||
Peculiarities:
|
||||
- require Google sentencepiece (https://github.com/google/sentencepiece)
|
||||
|
||||
Args:
|
||||
pretrained_model_name_or_path: Path to pretrained model archive
|
||||
or one of pre-trained vocab configs below.
|
||||
* xlnet-large-cased
|
||||
Keyword args:
|
||||
special_tokens: Special tokens in vocabulary that are not pretrained
|
||||
Default: None
|
||||
max_len: An artificial maximum length to truncate tokenized sequences to;
|
||||
Effective maximum length is always the minimum of this
|
||||
value (if specified) and the underlying model's
|
||||
sequence length.
|
||||
Default: None
|
||||
|
||||
Example:
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
|
||||
|
||||
text = "Who was Jim Henson ?"
|
||||
indexed_tokens = tokenizer.encode(text)
|
||||
"""
|
||||
tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(xlnet_docstring)
|
||||
def xlnetModel(*args, **kwargs):
|
||||
"""
|
||||
xlnetModel is the basic XLNet Transformer model from
|
||||
"XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
||||
by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
|
||||
|
||||
# Prepare tokenized input
|
||||
text_1 = "Who was Jim Henson ?"
|
||||
text_2 = "Jim Henson was a puppeteer"
|
||||
indexed_tokens_1 = tokenizer.encode(text_1)
|
||||
indexed_tokens_2 = tokenizer.encode(text_2)
|
||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load xlnetModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
hidden_states_1, mems = model(tokens_tensor_1)
|
||||
hidden_states_2, mems = model(tokens_tensor_2, mems=mems)
|
||||
"""
|
||||
model = XLNetModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(xlnet_docstring)
|
||||
def xlnetLMHeadModel(*args, **kwargs):
|
||||
"""
|
||||
xlnetLMHeadModel is the XLNet Transformer model from
|
||||
"XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
||||
by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
||||
with a tied (pre-trained) language modeling head on top.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
import torch
|
||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
|
||||
|
||||
# Prepare tokenized input
|
||||
text_1 = "Who was Jim Henson ?"
|
||||
text_2 = "Jim Henson was a puppeteer"
|
||||
indexed_tokens_1 = tokenizer.encode(text_1)
|
||||
indexed_tokens_2 = tokenizer.encode(text_2)
|
||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load xlnetLMHeadModel
|
||||
model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
|
||||
model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
predictions_1, mems = model(tokens_tensor_1)
|
||||
predictions_2, mems = model(tokens_tensor_2, mems=mems)
|
||||
|
||||
# Get the predicted last token
|
||||
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
|
||||
predicted_token = tokenizer.decode([predicted_index])
|
||||
assert predicted_token == ' who'
|
||||
"""
|
||||
model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
# @_append_from_pretrained_docstring(xlnet_docstring)
|
||||
# def xlnetForSequenceClassification(*args, **kwargs):
|
||||
# """
|
||||
# xlnetModel is the basic XLNet Transformer model from
|
||||
# "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
||||
# by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
||||
|
||||
# Example:
|
||||
# # Load the tokenizer
|
||||
# import torch
|
||||
# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
|
||||
|
||||
# # Prepare tokenized input
|
||||
# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
||||
# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
|
||||
# tokenized_text1 = tokenizer.tokenize(text1)
|
||||
# tokenized_text2 = tokenizer.tokenize(text2)
|
||||
# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
|
||||
# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
|
||||
# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
|
||||
# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
|
||||
|
||||
# # Load xlnetForSequenceClassification
|
||||
# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
|
||||
# model.eval()
|
||||
|
||||
# # Predict sequence classes logits
|
||||
# with torch.no_grad():
|
||||
# lm_logits, mems = model(tokens_tensor)
|
||||
# """
|
||||
# model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
|
||||
# return model
|
||||
@@ -1,4 +1,4 @@
|
||||
__version__ = "1.1.0"
|
||||
__version__ = "1.2.0"
|
||||
from .tokenization_auto import AutoTokenizer
|
||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
@@ -7,10 +7,12 @@ from .tokenization_gpt2 import GPT2Tokenizer
|
||||
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
||||
from .tokenization_xlm import XLMTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer
|
||||
from .tokenization_distilbert import DistilBertTokenizer
|
||||
|
||||
from .tokenization_utils import (PreTrainedTokenizer)
|
||||
|
||||
from .modeling_auto import (AutoConfig, AutoModel)
|
||||
from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
|
||||
AutoModelWithLMHead)
|
||||
|
||||
from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
|
||||
BertForMaskedLM, BertForNextSentencePrediction,
|
||||
@@ -40,6 +42,9 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
|
||||
XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
|
||||
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_distilbert import (DistilBertConfig, DistilBertForMaskedLM, DistilBertModel,
|
||||
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
||||
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
|
||||
PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p
|
||||
if gpt2_config_file == "":
|
||||
config = GPT2Config()
|
||||
else:
|
||||
config = GPT2Config(gpt2_config_file)
|
||||
config = GPT2Config.from_json_file(gpt2_config_file)
|
||||
model = GPT2Model(config)
|
||||
|
||||
# Load weights from numpy
|
||||
|
||||
@@ -35,7 +35,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
|
||||
if openai_config_file == "":
|
||||
config = OpenAIGPTConfig()
|
||||
else:
|
||||
config = OpenAIGPTConfig(openai_config_file)
|
||||
config = OpenAIGPTConfig.from_json_file(openai_config_file)
|
||||
model = OpenAIGPTModel(config)
|
||||
|
||||
# Load weights from numpy
|
||||
|
||||
@@ -53,6 +53,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
||||
intermediate_size=roberta.args.encoder_ffn_embed_dim,
|
||||
max_position_embeddings=514,
|
||||
type_vocab_size=1,
|
||||
layer_norm_eps=1e-5, # PyTorch default used in fairseq
|
||||
)
|
||||
if classification_head:
|
||||
config.num_labels = roberta.args.num_classes
|
||||
@@ -69,7 +70,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
||||
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them.
|
||||
model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
|
||||
model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
|
||||
model.roberta.embeddings.LayerNorm.variance_epsilon = roberta_sent_encoder.emb_layer_norm.eps
|
||||
|
||||
for i in range(config.num_hidden_layers):
|
||||
# Encoder: start of layer
|
||||
@@ -98,7 +98,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
||||
self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
|
||||
self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
|
||||
self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
|
||||
self_output.LayerNorm.variance_epsilon = roberta_layer.self_attn_layer_norm.eps
|
||||
|
||||
### intermediate
|
||||
intermediate: BertIntermediate = layer.intermediate
|
||||
@@ -117,7 +116,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
||||
bert_output.dense.bias = roberta_layer.fc2.bias
|
||||
bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
|
||||
bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
|
||||
bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps
|
||||
#### end of layer
|
||||
|
||||
if classification_head:
|
||||
@@ -131,7 +129,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
||||
model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
|
||||
model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
|
||||
model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
|
||||
model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps
|
||||
model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
|
||||
model.lm_head.bias = roberta.model.decoder.lm_head.bias
|
||||
|
||||
@@ -144,6 +141,8 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
||||
else:
|
||||
their_output = roberta.model(input_ids)[0]
|
||||
print(our_output.shape, their_output.shape)
|
||||
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
|
||||
print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
|
||||
success = torch.allclose(our_output, their_output, atol=1e-3)
|
||||
print(
|
||||
"Do both models output the same tensors?",
|
||||
|
||||
@@ -75,7 +75,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
|
||||
if transfo_xl_config_file == "":
|
||||
config = TransfoXLConfig()
|
||||
else:
|
||||
config = TransfoXLConfig(transfo_xl_config_file)
|
||||
config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
|
||||
print("Building PyTorch model from configuration: {}".format(str(config)))
|
||||
model = TransfoXLLMHeadModel(config)
|
||||
|
||||
|
||||
@@ -17,8 +17,9 @@ from hashlib import sha256
|
||||
from io import open
|
||||
|
||||
import boto3
|
||||
import requests
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
try:
|
||||
@@ -93,12 +94,15 @@ def filename_to_url(filename, cache_dir=None):
|
||||
return url, etag
|
||||
|
||||
|
||||
def cached_path(url_or_filename, cache_dir=None):
|
||||
def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
|
||||
"""
|
||||
Given something that might be a URL (or might be a local path),
|
||||
determine which. If it's a URL, download the file and cache it, and
|
||||
return the path to the cached file. If it's already a local path,
|
||||
make sure the file exists and then return the path.
|
||||
Args:
|
||||
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
||||
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
||||
@@ -111,7 +115,7 @@ def cached_path(url_or_filename, cache_dir=None):
|
||||
|
||||
if parsed.scheme in ('http', 'https', 's3'):
|
||||
# URL, so get it from the cache (downloading if necessary)
|
||||
return get_from_cache(url_or_filename, cache_dir)
|
||||
return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
elif os.path.exists(url_or_filename):
|
||||
# File, and it exists.
|
||||
return url_or_filename
|
||||
@@ -156,24 +160,24 @@ def s3_request(func):
|
||||
|
||||
|
||||
@s3_request
|
||||
def s3_etag(url):
|
||||
def s3_etag(url, proxies=None):
|
||||
"""Check ETag on S3 object."""
|
||||
s3_resource = boto3.resource("s3")
|
||||
s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
|
||||
bucket_name, s3_path = split_s3_path(url)
|
||||
s3_object = s3_resource.Object(bucket_name, s3_path)
|
||||
return s3_object.e_tag
|
||||
|
||||
|
||||
@s3_request
|
||||
def s3_get(url, temp_file):
|
||||
def s3_get(url, temp_file, proxies=None):
|
||||
"""Pull a file directly from S3."""
|
||||
s3_resource = boto3.resource("s3")
|
||||
s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
|
||||
bucket_name, s3_path = split_s3_path(url)
|
||||
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
||||
|
||||
|
||||
def http_get(url, temp_file):
|
||||
req = requests.get(url, stream=True)
|
||||
def http_get(url, temp_file, proxies=None):
|
||||
req = requests.get(url, stream=True, proxies=proxies)
|
||||
content_length = req.headers.get('Content-Length')
|
||||
total = int(content_length) if content_length is not None else None
|
||||
progress = tqdm(unit="B", total=total)
|
||||
@@ -184,7 +188,7 @@ def http_get(url, temp_file):
|
||||
progress.close()
|
||||
|
||||
|
||||
def get_from_cache(url, cache_dir=None):
|
||||
def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
|
||||
"""
|
||||
Given a URL, look for the corresponding dataset in the local cache.
|
||||
If it's not there, download it. Then return the path to the cached file.
|
||||
@@ -201,10 +205,10 @@ def get_from_cache(url, cache_dir=None):
|
||||
|
||||
# Get eTag to add to filename, if it exists.
|
||||
if url.startswith("s3://"):
|
||||
etag = s3_etag(url)
|
||||
etag = s3_etag(url, proxies=proxies)
|
||||
else:
|
||||
try:
|
||||
response = requests.head(url, allow_redirects=True)
|
||||
response = requests.head(url, allow_redirects=True, proxies=proxies)
|
||||
if response.status_code != 200:
|
||||
etag = None
|
||||
else:
|
||||
@@ -227,17 +231,17 @@ def get_from_cache(url, cache_dir=None):
|
||||
if matching_files:
|
||||
cache_path = os.path.join(cache_dir, matching_files[-1])
|
||||
|
||||
if not os.path.exists(cache_path):
|
||||
if not os.path.exists(cache_path) or force_download:
|
||||
# Download to temporary file, then copy to cache dir once finished.
|
||||
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
||||
with tempfile.NamedTemporaryFile() as temp_file:
|
||||
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
|
||||
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
|
||||
|
||||
# GET file object
|
||||
if url.startswith("s3://"):
|
||||
s3_get(url, temp_file)
|
||||
s3_get(url, temp_file, proxies=proxies)
|
||||
else:
|
||||
http_get(url, temp_file)
|
||||
http_get(url, temp_file, proxies=proxies)
|
||||
|
||||
# we are copying the file before closing it, so flush to avoid truncation
|
||||
temp_file.flush()
|
||||
|
||||
@@ -18,22 +18,20 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
|
||||
import logging
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import CrossEntropyLoss, MSELoss
|
||||
from torch.nn.parameter import Parameter
|
||||
from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
|
||||
from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel
|
||||
from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel
|
||||
from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel
|
||||
from .modeling_xlnet import XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
|
||||
from .modeling_xlm import XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
|
||||
from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
|
||||
from .modeling_distilbert import DistilBertConfig, DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
|
||||
|
||||
from .modeling_bert import BertConfig, BertModel
|
||||
from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
|
||||
from .modeling_gpt2 import GPT2Config, GPT2Model
|
||||
from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
|
||||
from .modeling_xlnet import XLNetConfig, XLNetModel
|
||||
from .modeling_xlm import XLMConfig, XLMModel
|
||||
|
||||
from .modeling_utils import PreTrainedModel, SequenceSummary
|
||||
from .modeling_utils import PreTrainedModel, SequenceSummary, add_start_docstrings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AutoConfig(object):
|
||||
r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
|
||||
that will be instantiated as one of the configuration classes of the library
|
||||
@@ -45,12 +43,14 @@ class AutoConfig(object):
|
||||
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throw an error).
|
||||
"""
|
||||
@@ -65,34 +65,42 @@ class AutoConfig(object):
|
||||
|
||||
The configuration class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||
|
||||
Params:
|
||||
**pretrained_model_name_or_path**: either:
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
|
||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
||||
- a path to a `directory` containing a configuration file saved
|
||||
using the `save_pretrained(save_directory)` method.
|
||||
- a path or url to a saved configuration `file`.
|
||||
**cache_dir**: (`optional`) string:
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||
|
||||
cache_dir: (`optional`) string:
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
**return_unused_kwargs**: (`optional`) bool:
|
||||
|
||||
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
|
||||
|
||||
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
|
||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
|
||||
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
|
||||
return_unused_kwargs: (`optional`) bool:
|
||||
|
||||
- If False, then this function returns just the final configuration object.
|
||||
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
|
||||
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
|
||||
ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
||||
**kwargs**: (`optional`) dict:
|
||||
Dictionary of key/value pairs with which to update the configuration object after loading.
|
||||
- The values in kwargs of any keys which are configuration attributes will be used
|
||||
to override the loaded values.
|
||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
|
||||
by the `return_unused_kwargs` keyword parameter.
|
||||
- If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -107,7 +115,11 @@ class AutoConfig(object):
|
||||
assert unused_kwargs == {'foo': False}
|
||||
|
||||
"""
|
||||
if 'bert' in pretrained_model_name_or_path:
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'bert' in pretrained_model_name_or_path:
|
||||
return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'openai-gpt' in pretrained_model_name_or_path:
|
||||
return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
@@ -122,7 +134,7 @@ class AutoConfig(object):
|
||||
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm'".format(pretrained_model_name_or_path))
|
||||
"'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
||||
|
||||
|
||||
class AutoModel(object):
|
||||
@@ -132,19 +144,21 @@ class AutoModel(object):
|
||||
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
The `from_pretrained()` method take care of returning the correct model class instance
|
||||
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||
using pattern matching on the `pretrained_model_name_or_path` string.
|
||||
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||
- contains `bert`: BertModel (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetModel (XLNet model)
|
||||
- contains `xlm`: XLMModel (XLM model)
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throw an error).
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
def __init__(self):
|
||||
raise EnvironmentError("AutoModel is designed to be instantiated "
|
||||
@@ -152,60 +166,64 @@ class AutoModel(object):
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiate a one of the base model classes of the library
|
||||
r""" Instantiates one of the base model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
The model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||
- contains `bert`: BertModel (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetModel (XLNet model)
|
||||
- contains `xlm`: XLMModel (XLM model)
|
||||
|
||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||
To train the model, you should first set it back in training mode with `model.train()`
|
||||
|
||||
Params:
|
||||
**pretrained_model_name_or_path**: either:
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache
|
||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
||||
- a path to a `directory` containing a configuration file saved
|
||||
using the `save_pretrained(save_directory)` method.
|
||||
- a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
|
||||
In this case, ``from_tf`` should be set to True and a configuration object should be
|
||||
provided as `config` argument. This loading option is slower than converting the TensorFlow
|
||||
checkpoint in a PyTorch model using the provided conversion scripts and loading
|
||||
the PyTorch model afterwards.
|
||||
**model_args**: (`optional`) Sequence:
|
||||
All remaning positional arguments will be passed to the underlying model's __init__ function
|
||||
**config**: an optional configuration for the model to use instead of an automatically loaded configuation.
|
||||
Configuration can be automatically loaded when:
|
||||
- the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
|
||||
- the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
|
||||
**state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
|
||||
from saved weights file.
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
|
||||
model_args: (`optional`) Sequence of positional arguments:
|
||||
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||
|
||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
||||
|
||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
||||
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||
|
||||
state_dict: (`optional`) dict:
|
||||
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
|
||||
a simpler option.
|
||||
**cache_dir**: (`optional`) string:
|
||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||
|
||||
cache_dir: (`optional`) string:
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
**output_loading_info**: (`optional`) boolean:
|
||||
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
|
||||
**kwargs**: (`optional`) dict:
|
||||
Dictionary of key, values to update the configuration object after loading.
|
||||
Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
|
||||
|
||||
- If a configuration is provided with `config`, **kwargs will be directly passed
|
||||
to the underlying model's __init__ method.
|
||||
- If a configuration is not provided, **kwargs will be first passed to the pretrained
|
||||
model configuration class loading function (`PretrainedConfig.from_pretrained`).
|
||||
Each key of **kwargs that corresponds to a configuration attribute
|
||||
will be used to override said attribute with the supplied **kwargs value.
|
||||
Remaining keys that do not correspond to any configuration attribute will
|
||||
be passed to the underlying model's __init__ function.
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
|
||||
output_loading_info: (`optional`) boolean:
|
||||
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
|
||||
|
||||
kwargs: (`optional`) Remaining dictionary of keyword arguments:
|
||||
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
||||
|
||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -218,7 +236,11 @@ class AutoModel(object):
|
||||
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
if 'bert' in pretrained_model_name_or_path:
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'bert' in pretrained_model_name_or_path:
|
||||
return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'openai-gpt' in pretrained_model_name_or_path:
|
||||
return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
@@ -233,4 +255,346 @@ class AutoModel(object):
|
||||
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm'".format(pretrained_model_name_or_path))
|
||||
"'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
||||
|
||||
|
||||
class AutoModelWithLMHead(object):
    r"""
        :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
        that will be instantiated as one of the language modeling model classes of the library
        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
            - contains `bert`: BertForMaskedLM (Bert model)
            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
            - contains `xlm`: XLMWithLMHeadModel (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
            - contains `bert`: BertForMaskedLM (Bert model)
            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
            - contains `xlm`: XLMWithLMHeadModel (XLM model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if none of the known patterns occurs in ``pretrained_model_name_or_path``.

        Examples::

            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = AutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        # Order matters: 'distilbert' and 'roberta' both contain 'bert', so the
        # more specific patterns have to be tested before the generic one.
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # The list of accepted patterns now also names 'distilbert', which is handled above.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta', 'distilbert'".format(pretrained_model_name_or_path))
|
||||
|
||||
|
||||
class AutoModelForSequenceClassification(object):
    r"""
        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
        that will be instantiated as one of the sequence classification model classes of the library
        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: BertForSequenceClassification (Bert model)
            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
            - contains `xlm`: XLMForSequenceClassification (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # The message names this class (it previously pointed users at AutoModelWithLMHead).
        raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated "
            "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the sequence classification model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: BertForSequenceClassification (Bert model)
            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
            - contains `xlm`: XLMForSequenceClassification (XLM model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if none of the known patterns occurs in ``pretrained_model_name_or_path``.

        Examples::

            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        # Order matters: 'distilbert' and 'roberta' both contain 'bert', so the
        # more specific patterns have to be tested before the generic one.
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # The list of accepted patterns now also names 'distilbert', which is handled above.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm', 'roberta', 'distilbert'".format(pretrained_model_name_or_path))
|
||||
|
||||
|
||||
class AutoModelForQuestionAnswering(object):
    r"""
        :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: BertForQuestionAnswering (Bert model)
            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: XLMForQuestionAnswering (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # The message names this class (it previously pointed users at AutoModelWithLMHead).
        raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
            "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: BertForQuestionAnswering (Bert model)
            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: XLMForQuestionAnswering (XLM model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if none of the known patterns occurs in ``pretrained_model_name_or_path``.

        Examples::

            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        # Order matters: 'distilbert' contains 'bert', so it has to be tested first.
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # NOTE(review): any identifier containing 'roberta' also contains 'bert', and
        # there is no 'roberta' branch here, so such names are routed to
        # BertForQuestionAnswering -- confirm this is intended.
        elif 'bert' in pretrained_model_name_or_path:
            return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # The list of accepted patterns now also names 'distilbert', which is handled above.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm', 'distilbert'".format(pretrained_model_name_or_path))
|
||||
|
||||
@@ -216,7 +216,7 @@ class BertConfig(PretrainedConfig):
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
else:
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)")
|
||||
" or the path to a pretrained model config file (str)")
|
||||
|
||||
|
||||
|
||||
@@ -224,20 +224,7 @@ try:
|
||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
||||
except (ImportError, AttributeError) as e:
|
||||
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
||||
class BertLayerNorm(nn.Module):
|
||||
def __init__(self, hidden_size, eps=1e-12):
|
||||
"""Construct a layernorm module in the TF style (epsilon inside the square root).
|
||||
"""
|
||||
super(BertLayerNorm, self).__init__()
|
||||
self.weight = nn.Parameter(torch.ones(hidden_size))
|
||||
self.bias = nn.Parameter(torch.zeros(hidden_size))
|
||||
self.variance_epsilon = eps
|
||||
|
||||
def forward(self, x):
|
||||
u = x.mean(-1, keepdim=True)
|
||||
s = (x - u).pow(2).mean(-1, keepdim=True)
|
||||
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
|
||||
return self.weight * x + self.bias
|
||||
BertLayerNorm = torch.nn.LayerNorm
|
||||
|
||||
class BertEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings.
|
||||
@@ -350,23 +337,30 @@ class BertAttention(nn.Module):
|
||||
super(BertAttention, self).__init__()
|
||||
self.self = BertSelfAttention(config)
|
||||
self.output = BertSelfOutput(config)
|
||||
self.pruned_heads = set()
|
||||
|
||||
def prune_heads(self, heads):
|
||||
if len(heads) == 0:
|
||||
return
|
||||
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
|
||||
heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads
|
||||
for head in heads:
|
||||
# Compute how many pruned heads are before the head and move the index accordingly
|
||||
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||
mask[head] = 0
|
||||
mask = mask.view(-1).contiguous().eq(1)
|
||||
index = torch.arange(len(mask))[mask].long()
|
||||
|
||||
# Prune linear layers
|
||||
self.self.query = prune_linear_layer(self.self.query, index)
|
||||
self.self.key = prune_linear_layer(self.self.key, index)
|
||||
self.self.value = prune_linear_layer(self.self.value, index)
|
||||
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
||||
# Update hyper params
|
||||
|
||||
# Update hyper params and store pruned heads
|
||||
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
|
||||
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
||||
self.pruned_heads = self.pruned_heads.union(heads)
|
||||
|
||||
def forward(self, input_tensor, attention_mask, head_mask=None):
|
||||
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
||||
@@ -449,7 +443,7 @@ class BertEncoder(nn.Module):
|
||||
outputs = outputs + (all_hidden_states,)
|
||||
if self.output_attentions:
|
||||
outputs = outputs + (all_attentions,)
|
||||
return outputs # outputs, (hidden states), (attentions)
|
||||
return outputs # last-layer hidden state, (all hidden states), (all attentions)
|
||||
|
||||
|
||||
class BertPooler(nn.Module):
|
||||
@@ -544,12 +538,8 @@ class BertPreTrainedModel(PreTrainedModel):
|
||||
load_tf_weights = load_tf_weights_in_bert
|
||||
base_model_prefix = "bert"
|
||||
|
||||
def __init__(self, *inputs, **kwargs):
|
||||
super(BertPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||
|
||||
def init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights """
|
||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
@@ -577,7 +567,9 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
|
||||
https://pytorch.org/docs/stable/nn.html#module
|
||||
|
||||
Parameters:
|
||||
config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||
config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
BERT_INPUTS_DOCSTRING = r"""
|
||||
@@ -597,7 +589,10 @@ BERT_INPUTS_DOCSTRING = r"""
|
||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0``
|
||||
|
||||
|
||||
Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
|
||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
@@ -657,7 +652,7 @@ class BertModel(BertPreTrainedModel):
|
||||
self.encoder = BertEncoder(config)
|
||||
self.pooler = BertPooler(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
old_embeddings = self.embeddings.word_embeddings
|
||||
@@ -766,7 +761,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
self.bert = BertModel(config)
|
||||
self.cls = BertPreTrainingHeads(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
@@ -834,7 +829,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
self.bert = BertModel(config)
|
||||
self.cls = BertOnlyMLMHead(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
@@ -899,7 +894,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
self.bert = BertModel(config)
|
||||
self.cls = BertOnlyNSPHead(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None,
|
||||
position_ids=None, head_mask=None):
|
||||
@@ -960,7 +955,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
||||
position_ids=None, head_mask=None):
|
||||
@@ -1064,7 +1059,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, 1)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
||||
position_ids=None, head_mask=None):
|
||||
@@ -1132,7 +1127,7 @@ class BertForTokenClassification(BertPreTrainedModel):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
||||
position_ids=None, head_mask=None):
|
||||
@@ -1206,7 +1201,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
self.bert = BertModel(config)
|
||||
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
|
||||
end_positions=None, position_ids=None, head_mask=None):
|
||||
|
||||
756
pytorch_transformers/modeling_distilbert.py
Normal file
756
pytorch_transformers/modeling_distilbert.py
Normal file
@@ -0,0 +1,756 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" PyTorch DistilBERT model
|
||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||
and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import copy
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import itertools
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from pytorch_transformers.modeling_utils import PretrainedConfig, PreTrainedModel, add_start_docstrings, prune_linear_layer
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Maps each pretrained DistilBERT model name to the URL of its PyTorch weights file.
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
}

# Maps the same model names to the URL of the matching JSON configuration file.
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
}
|
||||
|
||||
|
||||
class DistilBertConfig(PretrainedConfig):
    """Configuration holder for DistilBERT models.

    The first constructor argument is either an integer vocabulary size, in
    which case every hyper-parameter is taken from the keyword arguments, or
    the path of a JSON file whose top-level keys are copied verbatim onto the
    instance (the keyword hyper-parameters are not applied in that case).

    ``hidden_size``, ``num_attention_heads`` and ``num_hidden_layers`` are
    read-only aliases of ``dim``, ``n_heads`` and ``n_layers``.
    """
    pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 max_position_embeddings=512,
                 sinusoidal_pos_embds=True,
                 n_layers=6,
                 n_heads=12,
                 dim=768,
                 hidden_dim=4*768,
                 dropout=0.1,
                 attention_dropout=0.1,
                 activation='gelu',
                 initializer_range=0.02,
                 tie_weights_=True,
                 qa_dropout=0.1,
                 seq_classif_dropout=0.2,
                 **kwargs):
        super(DistilBertConfig, self).__init__(**kwargs)

        source = vocab_size_or_config_json_file

        # A text value is interpreted as the path of a JSON config file.
        # `unicode` only exists (and is only looked up) on Python 2.
        is_path = isinstance(source, str)
        if not is_path and sys.version_info[0] == 2:
            is_path = isinstance(source, unicode)  # noqa: F821

        if is_path:
            with open(source, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            # Write straight into the instance dict, exactly as stored in the file.
            for key, value in json_config.items():
                self.__dict__[key] = value
            return

        if not isinstance(source, int):
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")

        self.vocab_size = source
        self.max_position_embeddings = max_position_embeddings
        self.sinusoidal_pos_embds = sinusoidal_pos_embds
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dim = dim
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation = activation
        self.initializer_range = initializer_range
        self.tie_weights_ = tie_weights_
        self.qa_dropout = qa_dropout
        self.seq_classif_dropout = seq_classif_dropout

    @property
    def hidden_size(self):
        """Alias of ``dim``."""
        return self.dim

    @property
    def num_attention_heads(self):
        """Alias of ``n_heads``."""
        return self.n_heads

    @property
    def num_hidden_layers(self):
        """Alias of ``n_layers``."""
        return self.n_layers
|
||||
|
||||
|
||||
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
|
||||
def gelu(x):
    """Gaussian Error Linear Unit computed with the exact erf formulation.

    Returns ``0.5 * x * (1 + erf(x / sqrt(2)))`` element-wise.
    """
    half_x = 0.5 * x
    return half_x * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||
|
||||
def create_sinusoidal_embeddings(n_pos, dim, out):
    """Fill ``out`` in place with fixed sinusoidal position encodings.

    Parameters
    ----------
    n_pos: int
        Number of positions (rows written).
    dim: int
        Embedding size (columns written).
    out: torch.Tensor
        Tensor indexed as ``out[position, feature]``; typically the weight of
        an ``nn.Embedding``. Even columns receive the sine and odd columns the
        cosine of ``pos / 10000 ** (2 * (j // 2) / dim)``.

    The tensor is detached and marked as not requiring gradients, so the
    encodings stay frozen during training. Nothing is returned.
    """
    position_enc = np.array([
        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
        for pos in range(n_pos)
    ])
    # `out` is usually a freshly created nn.Parameter (a leaf that requires grad).
    # Autograd rejects in-place writes into a view of such a tensor, so the
    # copy has to happen with gradient tracking disabled.
    with torch.no_grad():
        out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
        out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
    out.requires_grad = False
|
||||
|
||||
class Embeddings(nn.Module):
    """Token + position embeddings followed by LayerNorm and dropout.

    There are no token-type (segment) embeddings in this module.
    """

    def __init__(self, config):
        super(Embeddings, self).__init__()
        hidden = config.dim
        n_positions = config.max_position_embeddings

        self.word_embeddings = nn.Embedding(config.vocab_size, hidden, padding_idx=0)
        self.position_embeddings = nn.Embedding(n_positions, hidden)
        if config.sinusoidal_pos_embds:
            # Overwrite the position table with fixed sinusoidal values.
            create_sinusoidal_embeddings(n_pos=n_positions,
                                         dim=hidden,
                                         out=self.position_embeddings.weight)

        self.LayerNorm = nn.LayerNorm(hidden, eps=1e-12)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, input_ids):
        """
        Parameters
        ----------
        input_ids: torch.tensor(bs, max_seq_length)
            The token ids to embed.

        Outputs
        -------
        embeddings: torch.tensor(bs, max_seq_length, dim)
            The embedded tokens (plus position embeddings, no token_type embeddings)
        """
        n_tokens = input_ids.size(1)
        # Positions 0..n_tokens-1, repeated for every sequence in the batch.
        positions = torch.arange(n_tokens, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
        positions = positions.unsqueeze(0).expand_as(input_ids)                         # (bs, max_seq_length)

        summed = self.word_embeddings(input_ids) + self.position_embeddings(positions)  # (bs, max_seq_length, dim)
        normalized = self.LayerNorm(summed)                                             # (bs, max_seq_length, dim)
        return self.dropout(normalized)                                                 # (bs, max_seq_length, dim)
|
||||
|
||||
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention with support for head pruning."""

    def __init__(self, config):
        super(MultiHeadSelfAttention, self).__init__()

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.dropout = nn.Dropout(p=config.attention_dropout)
        self.output_attentions = config.output_attentions

        assert self.dim % self.n_heads == 0

        self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)

        # Original indices of the heads that have already been removed.
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the projection layers.

        Parameters
        ----------
        heads: iterable of int
            Head indices, expressed in the numbering of the un-pruned layer.
            Heads that were already pruned are ignored.
        """
        attention_head_size = self.dim // self.n_heads
        if len(heads) == 0:
            return
        mask = torch.ones(self.n_heads, attention_head_size)
        heads = set(heads) - self.pruned_heads
        for head in heads:
            # Shift the index down by the number of earlier heads already removed.
            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        # Flat indices of the features that are kept.
        index = torch.arange(len(mask))[mask].long()
        # Prune linear layers
        self.q_lin = prune_linear_layer(self.q_lin, index)
        self.k_lin = prune_linear_layer(self.k_lin, index)
        self.v_lin = prune_linear_layer(self.v_lin, index)
        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.dim = attention_head_size * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, query, key, value, mask, head_mask=None):
        """
        Parameters
        ----------
        query: torch.tensor(bs, seq_length, dim)
        key: torch.tensor(bs, seq_length, dim)
        value: torch.tensor(bs, seq_length, dim)
        mask: torch.tensor(bs, seq_length)
            Key positions whose mask value is 0 are excluded from attention.
        head_mask: optional tensor
            Multiplied into the attention weights when given.

        Outputs
        -------
        context: torch.tensor(bs, seq_length, dim)
            Contextualized layer. Always the first element of the returned tuple.
        weights: torch.tensor(bs, n_heads, seq_length, seq_length)
            Attention weights (after dropout and head masking).
            Optional: only if `output_attentions=True`
        """
        bs, _, _ = query.size()
        k_length = key.size(1)

        dim_per_head = self.dim // self.n_heads

        assert 2 <= mask.dim() <= 3
        # The mask is broadcast over heads and query positions.
        mask_reshp = (bs, 1, 1, k_length)

        def shape(x):
            """ separate heads """
            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)

        def unshape(x):
            """ group heads """
            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)

        q = shape(self.q_lin(query))           # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))             # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))           # (bs, n_heads, k_length, dim_per_head)

        q = q / math.sqrt(dim_per_head)                     # (bs, n_heads, q_length, dim_per_head)
        scores = torch.matmul(q, k.transpose(2, 3))         # (bs, n_heads, q_length, k_length)
        mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
        # Use the most negative finite value rather than -inf: masked keys still
        # get exactly zero weight, but a row whose keys are all masked yields a
        # uniform distribution instead of NaN.
        scores.masked_fill_(mask, torch.finfo(scores.dtype).min)  # (bs, n_heads, q_length, k_length)

        weights = nn.functional.softmax(scores, dim=-1)     # (bs, n_heads, q_length, k_length)
        weights = self.dropout(weights)                     # (bs, n_heads, q_length, k_length)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)     # (bs, n_heads, q_length, dim_per_head)
        context = unshape(context)             # (bs, q_length, dim)
        context = self.out_lin(context)        # (bs, q_length, dim)

        if self.output_attentions:
            return (context, weights)
        return (context,)
|
||||
|
||||
class FFN(nn.Module):
    """Position-wise feed-forward sub-layer: linear, activation, linear, dropout."""

    def __init__(self, config):
        super(FFN, self).__init__()
        self.dropout = nn.Dropout(p=config.dropout)
        self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
        self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
        assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
        if config.activation == 'gelu':
            self.activation = gelu
        else:
            self.activation = nn.ReLU()

    def forward(self, input):
        """Apply the feed-forward transformation to ``input`` and return the result."""
        expanded = self.activation(self.lin1(input))
        return self.dropout(self.lin2(expanded))
|
||||
|
||||
class TransformerBlock(nn.Module):
    """One encoder layer: self-attention and feed-forward, each followed by a
    residual connection and LayerNorm."""

    def __init__(self, config):
        super(TransformerBlock, self).__init__()

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.hidden_dim = config.hidden_dim
        self.dropout = nn.Dropout(p=config.dropout)
        self.activation = config.activation
        self.output_attentions = config.output_attentions

        assert config.dim % config.n_heads == 0

        self.attention = MultiHeadSelfAttention(config)
        self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)

        self.ffn = FFN(config)
        self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)

    def forward(self, x, attn_mask=None, head_mask=None):
        """
        Parameters
        ----------
        x: torch.tensor(bs, seq_length, dim)
        attn_mask: torch.tensor(bs, seq_length)

        Outputs
        -------
        sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length)
            The attention weights. Only present (as first element) when
            `output_attentions` is set.
        ffn_output: torch.tensor(bs, seq_length, dim)
            The output of the transformer block contextualization.
        """
        # Self-Attention (the attention module always hands back a tuple)
        attn_outputs = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask)
        if self.output_attentions:
            attended, sa_weights = attn_outputs   # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
        else:
            assert type(attn_outputs) == tuple
            attended = attn_outputs[0]
        attended = self.sa_layer_norm(attended + x)                         # (bs, seq_length, dim)

        # Feed Forward Network
        block_output = self.output_layer_norm(self.ffn(attended) + attended)  # (bs, seq_length, dim)

        if self.output_attentions:
            return (sa_weights, block_output)
        return (block_output,)
|
||||
|
||||
|
||||
class Transformer(nn.Module):
    """Stack of ``config.n_layers`` transformer blocks applied in sequence."""

    def __init__(self, config):
        super(Transformer, self).__init__()
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        # Build one block and deep-copy it for every layer.
        layer = TransformerBlock(config)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])

    def forward(self, x, attn_mask=None, head_mask=None):
        """
        Parameters
        ----------
        x: torch.tensor(bs, seq_length, dim)
            Input sequence embedded.
        attn_mask: torch.tensor(bs, seq_length)
            Attention mask on the sequence.
        head_mask: optional sequence indexable by layer number
            Entry ``i`` is forwarded to layer ``i``. ``None`` (the default)
            means that no layer receives a head mask.

        Outputs
        -------
        hidden_state: torch.tensor(bs, seq_length, dim)
            Sequence of hiddens states in the last (top) layer
        all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
            Tuple of length n_layers + 1: the input followed by the hidden
            states produced by each layer.
            Optional: only if output_hidden_states=True
        all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
            Tuple of length n_layers with the attention weights from each layer
            Optional: only if output_attentions=True
        """
        if head_mask is None:
            # The signature advertises None as default; without this the
            # per-layer indexing below would raise a TypeError.
            head_mask = [None] * len(self.layer)

        all_hidden_states = ()
        all_attentions = ()

        hidden_state = x
        for i, layer_module in enumerate(self.layer):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_state,)

            layer_outputs = layer_module(x=hidden_state,
                                         attn_mask=attn_mask,
                                         head_mask=head_mask[i])
            # The block output is always the last element of the tuple.
            hidden_state = layer_outputs[-1]

            if self.output_attentions:
                assert len(layer_outputs) == 2
                attentions = layer_outputs[0]
                all_attentions = all_attentions + (attentions,)
            else:
                assert len(layer_outputs) == 1

        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_state,)

        outputs = (hidden_state,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
|
||||
|
||||
|
||||
### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
|
||||
class DistilBertPreTrainedModel(PreTrainedModel):
    """ Abstract base of the DistilBERT models: declares the configuration class,
        the pretrained weight archive and the weight initialization scheme used
        by the concrete models.
    """
    config_class = DistilBertConfig
    pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = None
    base_model_prefix = "distilbert"

    def __init__(self, *inputs, **kwargs):
        super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """ Initialize the weights of a single sub-module.

            Embeddings (only when trainable) and Linear weights are drawn from a
            normal distribution with std ``config.initializer_range``; Linear
            biases are zeroed; LayerNorm is reset to weight 1 / bias 0.
        """
        # Frozen embeddings (requires_grad False) are left untouched.
        if isinstance(module, nn.Embedding) and module.weight.requires_grad:
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
|
||||
|
||||
|
||||
DISTILBERT_START_DOCSTRING = r"""
|
||||
DistilBERT is a small, fast, cheap and light Transformer model
|
||||
trained by distilling Bert base. It has 40% fewer parameters than
|
||||
`bert-base-uncased`, runs 60% faster while preserving over 95% of
|
||||
Bert's performances as measured on the GLUE language understanding benchmark.
|
||||
|
||||
Here are the differences between the interface of Bert and DistilBert:
|
||||
|
||||
- DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
|
||||
- DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.
|
||||
|
||||
For more information on DistilBERT, please refer to our
|
||||
`detailed blog post`_
|
||||
|
||||
.. _`detailed blog post`:
|
||||
https://medium.com/huggingface/distilbert-8cf3380435b5
|
||||
|
||||
Parameters:
|
||||
config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
DISTILBERT_INPUTS_DOCSTRING = r"""
|
||||
Inputs:
|
||||
**input_ids** ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
The input sequences should start with `[CLS]` and end with `[SEP]` tokens.
|
||||
|
||||
For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
|
||||
**attention_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||
"""
|
||||
|
||||
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertModel(DistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config):
        super(DistilBertModel, self).__init__(config)

        self.embeddings = Embeddings(config)    # Embeddings
        self.transformer = Transformer(config)  # Encoder

        self.init_weights()

    def _resize_token_embeddings(self, new_num_tokens):
        """ Swap the word-embedding matrix for one resized to `new_num_tokens`
            rows and return the embedding module now in use.
        """
        resized = self._get_resized_embeddings(self.embeddings.word_embeddings, new_num_tokens)
        self.embeddings.word_embeddings = resized
        return self.embeddings.word_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        for layer_idx, head_ids in heads_to_prune.items():
            self.transformer.layer[layer_idx].attention.prune_heads(head_ids)

    def forward(self,
                input_ids, attention_mask=None, head_mask=None):
        # Without an explicit mask every token is attended to.
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)  # (bs, seq_length)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is None:
            head_mask = [None] * self.config.num_hidden_layers
        else:
            n_mask_dims = head_mask.dim()
            if n_mask_dims == 1:
                # Same mask for every layer.
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif n_mask_dims == 2:
                # We can specify head_mask for each layer
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
            # switch to float if needed + fp16 compatibility
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)

        embedded = self.embeddings(input_ids)   # (bs, seq_length, dim)
        encoder_outputs = self.transformer(x=embedded,
                                           attn_mask=attention_mask,
                                           head_mask=head_mask)

        # last-layer hidden-state, (all hidden_states), (all attentions)
        return (encoder_outputs[0], ) + encoder_outputs[1:]
|
||||
|
||||
|
||||
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    r"""
        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, masked_lm_labels=input_ids)
        loss, prediction_scores = outputs[:2]

    """
    def __init__(self, config):
        super(DistilBertForMaskedLM, self).__init__(config)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        # Encoder followed by the MLM head: dense -> gelu -> LayerNorm -> vocabulary projection.
        self.distilbert = DistilBertModel(config)
        self.vocab_transform = nn.Linear(config.dim, config.dim)
        self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
        self.vocab_projector = nn.Linear(config.dim, config.vocab_size)

        self.init_weights()
        self.tie_weights()

        # Label -1 marks the positions that do not contribute to the loss.
        self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)

    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        input_embeddings = self.distilbert.embeddings.word_embeddings
        self._tie_or_clone_weights(self.vocab_projector, input_embeddings)

    def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None):
        encoder_outputs = self.distilbert(input_ids=input_ids,
                                          attention_mask=attention_mask,
                                          head_mask=head_mask)
        last_hidden = encoder_outputs[0]                            # (bs, seq_length, dim)

        transformed = self.vocab_transform(last_hidden)             # (bs, seq_length, dim)
        transformed = self.vocab_layer_norm(gelu(transformed))      # (bs, seq_length, dim)
        prediction_logits = self.vocab_projector(transformed)       # (bs, seq_length, vocab_size)

        outputs = (prediction_logits, ) + encoder_outputs[1:]
        if masked_lm_labels is None:
            return outputs  # prediction_logits, (all hidden_states), (all attentions)

        # Flatten batch and sequence dimensions before computing the token-level loss.
        flat_logits = prediction_logits.view(-1, prediction_logits.size(-1))
        mlm_loss = self.mlm_loss_fct(flat_logits, masked_lm_labels.view(-1))
        return (mlm_loss,) + outputs  # mlm_loss, prediction_logits, (all hidden_states), (all attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
                         the pooled output) e.g. for GLUE tasks. """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
        super(DistilBertForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        # Encoder followed by the head: dense -> ReLU -> dropout -> classifier.
        self.distilbert = DistilBertModel(config)
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
        encoder_outputs = self.distilbert(input_ids=input_ids,
                                          attention_mask=attention_mask,
                                          head_mask=head_mask)
        # The hidden state of the first token is used as the sequence summary.
        first_token_state = encoder_outputs[0][:, 0]         # (bs, dim)
        pooled = self.pre_classifier(first_token_state)      # (bs, dim)
        pooled = nn.functional.relu(pooled)                  # (bs, dim)
        pooled = self.dropout(pooled)                        # (bs, dim)
        logits = self.classifier(pooled)                     # (bs, num_labels)

        outputs = (logits,) + encoder_outputs[1:]
        if labels is None:
            return outputs  # logits, (hidden_states), (attentions)

        if self.num_labels == 1:
            # Single output unit: regression with a mean-squared error.
            loss = nn.MSELoss()(logits.view(-1), labels.view(-1))
        else:
            loss = nn.CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss,) + outputs  # loss, logits, (hidden_states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
                         the hidden-states output to compute `span start logits` and `span end logits`). """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
    r"""
        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Total span extraction loss: the average of the Cross-Entropy losses for the start and end positions.
        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
            Span-start scores (before SoftMax).
        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
            Span-end scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss, start_scores, end_scores = outputs[:3]

    """
    def __init__(self, config):
        """Build the DistilBERT encoder and the two-logit span head.

        Reads ``dim``, ``num_labels`` (must be 2: one start and one end logit
        per token) and ``qa_dropout`` from ``config``.
        """
        super(DistilBertForQuestionAnswering, self).__init__(config)

        self.distilbert = DistilBertModel(config)
        self.qa_outputs = nn.Linear(config.dim, config.num_labels)
        # forward() splits the last dimension into exactly (start, end).
        assert config.num_labels == 2
        self.dropout = nn.Dropout(config.qa_dropout)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
        """Compute per-token span start/end logits and, optionally, the span loss.

        The loss is only computed (and prepended to the outputs) when both
        ``start_positions`` and ``end_positions`` are given. The caller's
        position tensors are never modified.
        """
        distilbert_output = self.distilbert(input_ids=input_ids,
                                            attention_mask=attention_mask,
                                            head_mask=head_mask)
        hidden_states = distilbert_output[0]                                 # (bs, max_query_len, dim)

        hidden_states = self.dropout(hidden_states)                          # (bs, max_query_len, dim)
        logits = self.qa_outputs(hidden_states)                              # (bs, max_query_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)                              # (bs, max_query_len)
        end_logits = end_logits.squeeze(-1)                                  # (bs, max_query_len)

        outputs = (start_logits, end_logits,) + distilbert_output[1:]
        if start_positions is not None and end_positions is not None:
            # Positions may arrive with an extra trailing dimension
            # (the original note attributes this to multi-GPU batch splitting); drop it.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs: clamp them
            # onto `ignored_index` so the loss skips those terms. Out-of-place clamp is
            # used so the tensors passed in by the caller are left untouched.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
|
||||
@@ -38,9 +38,11 @@ from .modeling_bert import BertLayerNorm as LayerNorm
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
|
||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
|
||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
|
||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
|
||||
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
|
||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
|
||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
|
||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
|
||||
|
||||
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
||||
""" Load tf checkpoints in a pytorch model
|
||||
@@ -231,22 +233,29 @@ class Attention(nn.Module):
|
||||
self.c_proj = Conv1D(n_state, nx)
|
||||
self.attn_dropout = nn.Dropout(config.attn_pdrop)
|
||||
self.resid_dropout = nn.Dropout(config.resid_pdrop)
|
||||
self.pruned_heads = set()
|
||||
|
||||
def prune_heads(self, heads):
    """Remove the given attention heads from ``c_attn`` / ``c_proj``.

    Args:
        heads: collection of head indices to prune. Heads already recorded
            in ``self.pruned_heads`` are skipped.

    Updates ``self.c_attn``, ``self.c_proj``, ``self.split_size``,
    ``self.n_head`` and ``self.pruned_heads`` in place; returns ``None``.
    """
    if len(heads) == 0:
        return

    head_dim = self.split_size // self.n_head
    keep = torch.ones(self.n_head, head_dim)
    # Convert to a set and remove heads that were already pruned.
    heads = set(heads) - self.pruned_heads
    for head in heads:
        # Shift the index left by the number of previously pruned heads that precede it.
        shifted = head - sum(1 for pruned in self.pruned_heads if pruned < head)
        keep[shifted] = 0
    keep = keep.view(-1).contiguous().eq(1)
    index = torch.arange(len(keep))[keep].long()
    # c_attn holds three consecutive chunks of width split_size; keep the same columns in each.
    index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

    # Prune conv1d layers
    self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
    self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

    # Update hyper params
    remaining_heads = self.n_head - len(heads)
    self.split_size = head_dim * remaining_heads
    self.n_head = remaining_heads
    self.pruned_heads = self.pruned_heads.union(heads)
|
||||
|
||||
def _attn(self, q, k, v, head_mask=None):
|
||||
w = torch.matmul(q, k)
|
||||
@@ -352,7 +361,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
|
||||
def __init__(self, *inputs, **kwargs):
|
||||
super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||
|
||||
def init_weights(self, module):
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
||||
@@ -383,11 +392,15 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in
|
||||
|
||||
Parameters:
|
||||
config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
|
||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
@@ -402,10 +415,6 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
||||
list of ``torch.FloatTensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
(see `past` output below). Can be used to speed up sequential decoding.
|
||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
@@ -451,7 +460,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||
self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
|
||||
@@ -566,8 +575,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
import torch
|
||||
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=input_ids)
|
||||
loss, logits = outputs[:2]
|
||||
@@ -578,7 +591,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
self.transformer = GPT2Model(config)
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
@@ -612,7 +625,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
|
||||
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
||||
The language modeling head has its weights tied to the input embeddings,
|
||||
the classification head takes as input the input of a specified classification token index in the intput sequence).
|
||||
the classification head takes as input the input of a specified classification token index in the input sequence).
|
||||
""", GPT2_START_DOCSTRING)
|
||||
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
r""" Inputs:
|
||||
@@ -636,10 +649,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
list of ``torch.FloatTensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
(see `past` output below). Can be used to speed up sequential decoding.
|
||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
@@ -650,14 +659,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
**multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
||||
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
|
||||
`multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
|
||||
with indices selected in [0, ..., num_choices].
|
||||
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Language modeling loss.
|
||||
@@ -681,13 +687,25 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
import torch
|
||||
from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
||||
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
||||
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
|
||||
|
||||
# Add a [CLS] to the vocabulary (we should train it also!)
|
||||
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
|
||||
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
|
||||
print(tokenizer.cls_token_id, len(tokenizer)) # The newly added token is the last token of the vocabulary
|
||||
|
||||
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
|
||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, mc_token_ids)
|
||||
encoded_choices = [tokenizer.encode(s) for s in choices]
|
||||
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
|
||||
|
||||
input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
|
||||
mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
|
||||
|
||||
outputs = model(input_ids, mc_token_ids=mc_token_ids)
|
||||
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
@@ -697,7 +715,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
self.multiple_choice_head = SequenceSummary(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
""" Make sure we are sharing the input and output embeddings.
|
||||
|
||||
@@ -249,12 +249,15 @@ class Attention(nn.Module):
|
||||
self.c_proj = Conv1D(n_state, nx)
|
||||
self.attn_dropout = nn.Dropout(config.attn_pdrop)
|
||||
self.resid_dropout = nn.Dropout(config.resid_pdrop)
|
||||
self.pruned_heads = set()
|
||||
|
||||
def prune_heads(self, heads):
|
||||
if len(heads) == 0:
|
||||
return
|
||||
mask = torch.ones(self.n_head, self.split_size // self.n_head)
|
||||
heads = set(heads) - self.pruned_heads
|
||||
for head in heads:
|
||||
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||
mask[head] = 0
|
||||
mask = mask.view(-1).contiguous().eq(1)
|
||||
index = torch.arange(len(mask))[mask].long()
|
||||
@@ -265,6 +268,7 @@ class Attention(nn.Module):
|
||||
# Update hyper params
|
||||
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
|
||||
self.n_head = self.n_head - len(heads)
|
||||
self.pruned_heads = self.pruned_heads.union(heads)
|
||||
|
||||
def _attn(self, q, k, v, head_mask=None):
|
||||
w = torch.matmul(q, k)
|
||||
@@ -363,10 +367,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
||||
load_tf_weights = load_tf_weights_in_openai_gpt
|
||||
base_model_prefix = "transformer"
|
||||
|
||||
def __init__(self, *inputs, **kwargs):
|
||||
super(OpenAIGPTPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||
|
||||
def init_weights(self, module):
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
||||
@@ -397,11 +398,15 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in
|
||||
|
||||
Parameters:
|
||||
config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
|
||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
@@ -411,11 +416,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
|
||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
@@ -456,7 +457,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
self.drop = nn.Dropout(config.embd_pdrop)
|
||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
|
||||
@@ -569,7 +570,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
self.transformer = OpenAIGPTModel(config)
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
@@ -602,7 +603,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
|
||||
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
||||
The language modeling head has its weights tied to the input embeddings,
|
||||
the classification head takes as input the input of a specified classification token index in the intput sequence).
|
||||
the classification head takes as input the input of a specified classification token index in the input sequence).
|
||||
""", OPENAI_GPT_START_DOCSTRING)
|
||||
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
r""" Inputs:
|
||||
@@ -622,10 +623,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
@@ -636,7 +633,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
**multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
||||
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
@@ -680,7 +677,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
self.multiple_choice_head = SequenceSummary(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
|
||||
@@ -90,25 +90,30 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
||||
|
||||
Parameters:
|
||||
config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the
|
||||
model.
|
||||
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
ROBERTA_INPUTS_DOCSTRING = r"""
|
||||
Inputs:
|
||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
To match pre-training, RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||
To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
|
||||
|
||||
(a) For sequence pairs:
|
||||
|
||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP][SEP] no it is not . [SEP]``
|
||||
``tokens: <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
|
||||
|
||||
(b) For single sequences:
|
||||
|
||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||
``tokens: <s> the dog is hairy . </s>``
|
||||
|
||||
Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with
|
||||
the ``add_special_tokens`` parameter set to ``True``.
|
||||
|
||||
RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
|
||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
@@ -163,7 +168,7 @@ class RobertaModel(BertModel):
|
||||
super(RobertaModel, self).__init__(config)
|
||||
|
||||
self.embeddings = RobertaEmbeddings(config)
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
|
||||
if input_ids[:, 0].sum().item() != 0:
|
||||
@@ -215,7 +220,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
self.roberta = RobertaModel(config)
|
||||
self.lm_head = RobertaLMHead(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
|
||||
@@ -285,7 +285,7 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
self.init_std = init_std
|
||||
else:
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)")
|
||||
" or the path to a pretrained model config file (str)")
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
@@ -853,9 +853,6 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
|
||||
load_tf_weights = load_tf_weights_in_transfo_xl
|
||||
base_model_prefix = "transformer"
|
||||
|
||||
def __init__(self, *inputs, **kwargs):
|
||||
super(TransfoXLPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||
|
||||
def _init_weight(self, weight):
|
||||
if self.config.init == 'uniform':
|
||||
nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
|
||||
@@ -865,7 +862,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
|
||||
def _init_bias(self, bias):
|
||||
nn.init.constant_(bias, 0.0)
|
||||
|
||||
def init_weights(self, m):
|
||||
def _init_weights(self, m):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
classname = m.__class__.__name__
|
||||
@@ -928,12 +925,16 @@ TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
|
||||
|
||||
Parameters:
|
||||
config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
TRANSFO_XL_INPUTS_DOCSTRING = r"""
|
||||
Inputs:
|
||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
|
||||
the right or on the left.
|
||||
Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
|
||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
@@ -1055,7 +1056,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
||||
self.r_emb = nn.Parameter(torch.FloatTensor(
|
||||
self.n_layer, self.max_klen, self.n_head, self.d_head))
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
return self.word_emb
|
||||
@@ -1138,10 +1139,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
||||
else:
|
||||
mask_shift_len = qlen
|
||||
dec_attn_mask = (torch.triu(all_ones, 1+mlen)
|
||||
+ torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1
|
||||
+ torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None] # -1
|
||||
else:
|
||||
dec_attn_mask = torch.triu(
|
||||
word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None]
|
||||
word_emb.new_ones(qlen, klen), diagonal=1+mlen).bool()[:,:,None]
|
||||
|
||||
hids = []
|
||||
attentions = []
|
||||
@@ -1302,7 +1303,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
||||
else:
|
||||
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
|
||||
config.cutoffs, div_val=config.div_val)
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
|
||||
@@ -59,6 +59,12 @@ if not six.PY2:
|
||||
fn.__doc__ = ''.join(docstr) + fn.__doc__
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
def add_end_docstrings(*docstr):
    """Return a decorator that appends ``docstr`` to the target's docstring.

    Args:
        *docstr: string fragments; they are concatenated without a separator
            and added after the existing docstring.

    Returns:
        A decorator that updates ``fn.__doc__`` in place and returns ``fn``.
    """
    def docstring_decorator(fn):
        # ``__doc__`` is None when the target has no docstring (or when
        # docstrings are stripped with ``python -OO``); treat that as empty
        # instead of failing with a TypeError on ``None + str``.
        fn.__doc__ = (fn.__doc__ or '') + ''.join(docstr)
        return fn
    return docstring_decorator
|
||||
else:
|
||||
# Not possible to update class docstrings on python2
|
||||
def add_start_docstrings(*docstr):
|
||||
@@ -66,11 +72,20 @@ else:
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
def add_end_docstrings(*docstr):
    """Python 2 fallback: return a decorator that leaves its target unchanged.

    ``docstr`` is accepted only for signature compatibility with the
    Python 3 variant and is ignored.
    """
    def docstring_decorator(fn):
        # Intentionally a no-op: hand the object back as-is.
        return fn
    return docstring_decorator
|
||||
|
||||
|
||||
class PretrainedConfig(object):
|
||||
r""" Base class for all configuration classes.
|
||||
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
|
||||
|
||||
Note:
|
||||
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
|
||||
It only affects the model's configuration.
|
||||
|
||||
Class attributes (overridden by derived classes):
|
||||
- ``pretrained_config_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
|
||||
|
||||
@@ -89,6 +104,7 @@ class PretrainedConfig(object):
|
||||
self.output_attentions = kwargs.pop('output_attentions', False)
|
||||
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
||||
self.torchscript = kwargs.pop('torchscript', False)
|
||||
self.pruned_heads = kwargs.pop('pruned_heads', {})
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
""" Save a configuration object to the directory `save_directory`, so that it
|
||||
@@ -121,6 +137,13 @@ class PretrainedConfig(object):
|
||||
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
|
||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
|
||||
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
|
||||
return_unused_kwargs: (`optional`) bool:
|
||||
|
||||
- If False, then this function returns just the final configuration object.
|
||||
@@ -142,6 +165,8 @@ class PretrainedConfig(object):
|
||||
|
||||
"""
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
force_download = kwargs.pop('force_download', False)
|
||||
proxies = kwargs.pop('proxies', None)
|
||||
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
|
||||
|
||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
||||
@@ -152,8 +177,8 @@ class PretrainedConfig(object):
|
||||
config_file = pretrained_model_name_or_path
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
except EnvironmentError as e:
|
||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
||||
logger.error(
|
||||
"Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
||||
@@ -166,7 +191,7 @@ class PretrainedConfig(object):
|
||||
pretrained_model_name_or_path,
|
||||
', '.join(cls.pretrained_config_archive_map.keys()),
|
||||
config_file))
|
||||
return None
|
||||
raise e
|
||||
if resolved_config_file == config_file:
|
||||
logger.info("loading configuration file {}".format(config_file))
|
||||
else:
|
||||
@@ -176,6 +201,9 @@ class PretrainedConfig(object):
|
||||
# Load config
|
||||
config = cls.from_json_file(resolved_config_file)
|
||||
|
||||
if hasattr(config, 'pruned_heads'):
|
||||
config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items())
|
||||
|
||||
# Update config with kwargs if needed
|
||||
to_remove = []
|
||||
for key, value in kwargs.items():
|
||||
@@ -287,7 +315,7 @@ class PreTrainedModel(nn.Module):
|
||||
new_embeddings.to(old_embeddings.weight.device)
|
||||
|
||||
# initialize all new embeddings (in particular added tokens)
|
||||
self.init_weights(new_embeddings)
|
||||
self._init_weights(new_embeddings)
|
||||
|
||||
# Copy word embeddings from the previous weights
|
||||
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
|
||||
@@ -303,6 +331,14 @@ class PreTrainedModel(nn.Module):
|
||||
else:
|
||||
first_module.weight = second_module.weight
|
||||
|
||||
if hasattr(first_module, 'bias') and first_module.bias is not None:
|
||||
first_module.bias.data = torch.nn.functional.pad(
|
||||
first_module.bias.data,
|
||||
(0, first_module.weight.shape[0] - first_module.bias.shape[0]),
|
||||
'constant',
|
||||
0
|
||||
)
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens=None):
|
||||
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
|
||||
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
|
||||
@@ -331,14 +367,30 @@ class PreTrainedModel(nn.Module):
|
||||
|
||||
return model_embeds
|
||||
|
||||
def init_weights(self):
|
||||
""" Initialize the weights and prune heads if needed. """
|
||||
# Initialize weights
|
||||
self.apply(self._init_weights)
|
||||
|
||||
# Prune heads if needed
|
||||
if self.config.pruned_heads:
|
||||
self.prune_heads(self.config.pruned_heads)
|
||||
|
||||
def prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the base model.
|
||||
|
||||
Arguments:
|
||||
|
||||
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
|
||||
E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
|
||||
"""
|
||||
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
||||
|
||||
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
|
||||
for layer, heads in heads_to_prune.items():
|
||||
union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
|
||||
self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
|
||||
|
||||
base_model._prune_heads(heads_to_prune)
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
@@ -396,6 +448,13 @@ class PreTrainedModel(nn.Module):
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force the (re-)download of the model weights and configuration files and override the cached versions if they exist.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
|
||||
output_loading_info: (`optional`) boolean:
|
||||
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
|
||||
|
||||
@@ -420,6 +479,8 @@ class PreTrainedModel(nn.Module):
|
||||
state_dict = kwargs.pop('state_dict', None)
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
from_tf = kwargs.pop('from_tf', False)
|
||||
force_download = kwargs.pop('force_download', False)
|
||||
proxies = kwargs.pop('proxies', None)
|
||||
output_loading_info = kwargs.pop('output_loading_info', False)
|
||||
|
||||
# Load config
|
||||
@@ -427,6 +488,7 @@ class PreTrainedModel(nn.Module):
|
||||
config, model_kwargs = cls.config_class.from_pretrained(
|
||||
pretrained_model_name_or_path, *model_args,
|
||||
cache_dir=cache_dir, return_unused_kwargs=True,
|
||||
force_download=force_download,
|
||||
**kwargs
|
||||
)
|
||||
else:
|
||||
@@ -449,8 +511,8 @@ class PreTrainedModel(nn.Module):
|
||||
archive_file = pretrained_model_name_or_path
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
except EnvironmentError as e:
|
||||
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
|
||||
logger.error(
|
||||
"Couldn't reach server at '{}' to download pretrained weights.".format(
|
||||
@@ -463,7 +525,7 @@ class PreTrainedModel(nn.Module):
|
||||
pretrained_model_name_or_path,
|
||||
', '.join(cls.pretrained_model_archive_map.keys()),
|
||||
archive_file))
|
||||
return None
|
||||
raise e
|
||||
if resolved_archive_file == archive_file:
|
||||
logger.info("loading weights file {}".format(archive_file))
|
||||
else:
|
||||
|
||||
@@ -44,6 +44,8 @@ XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
|
||||
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
|
||||
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
|
||||
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin",
|
||||
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin",
|
||||
}
|
||||
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
|
||||
@@ -54,6 +56,8 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
|
||||
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
|
||||
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
|
||||
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
|
||||
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json",
|
||||
}
|
||||
|
||||
|
||||
@@ -114,6 +118,7 @@ class XLMConfig(PretrainedConfig):
|
||||
causal=False,
|
||||
asm=False,
|
||||
n_langs=1,
|
||||
use_lang_emb=True,
|
||||
max_position_embeddings=512,
|
||||
embed_init_std=2048 ** -0.5,
|
||||
layer_norm_eps=1e-12,
|
||||
@@ -157,6 +162,7 @@ class XLMConfig(PretrainedConfig):
|
||||
self.causal = causal
|
||||
self.asm = asm
|
||||
self.n_langs = n_langs
|
||||
self.use_lang_emb = use_lang_emb
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.bos_index = bos_index
|
||||
self.eos_index = eos_index
|
||||
@@ -178,7 +184,7 @@ class XLMConfig(PretrainedConfig):
|
||||
self.end_n_top = end_n_top
|
||||
else:
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)")
|
||||
" or the path to a pretrained model config file (str)")
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
@@ -265,13 +271,16 @@ class MultiHeadAttention(nn.Module):
|
||||
self.k_lin = nn.Linear(dim, dim)
|
||||
self.v_lin = nn.Linear(dim, dim)
|
||||
self.out_lin = nn.Linear(dim, dim)
|
||||
self.pruned_heads = set()
|
||||
|
||||
def prune_heads(self, heads):
|
||||
attention_head_size = self.dim // self.n_heads
|
||||
if len(heads) == 0:
|
||||
return
|
||||
mask = torch.ones(self.n_heads, attention_head_size)
|
||||
heads = set(heads) - self.pruned_heads
|
||||
for head in heads:
|
||||
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||
mask[head] = 0
|
||||
mask = mask.view(-1).contiguous().eq(1)
|
||||
index = torch.arange(len(mask))[mask].long()
|
||||
@@ -283,6 +292,7 @@ class MultiHeadAttention(nn.Module):
|
||||
# Update hyper params
|
||||
self.n_heads = self.n_heads - len(heads)
|
||||
self.dim = attention_head_size * self.n_heads
|
||||
self.pruned_heads = self.pruned_heads.union(heads)
|
||||
|
||||
def forward(self, input, mask, kv=None, cache=None, head_mask=None):
|
||||
"""
|
||||
@@ -377,7 +387,7 @@ class XLMPreTrainedModel(PreTrainedModel):
|
||||
def __init__(self, *inputs, **kwargs):
|
||||
super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||
|
||||
def init_weights(self, module):
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights. """
|
||||
if isinstance(module, nn.Embedding):
|
||||
if self.config is not None and self.config.embed_init_std is not None:
|
||||
@@ -416,12 +426,18 @@ XLM_START_DOCSTRING = r""" The XLM model was proposed in
|
||||
|
||||
Parameters:
|
||||
config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
XLM_INPUTS_DOCSTRING = r"""
|
||||
Inputs:
|
||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
|
||||
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
|
||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
@@ -434,8 +450,10 @@ XLM_INPUTS_DOCSTRING = r"""
|
||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||
**langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
A parallel sequence of tokens to be used to indicate the language of each token in the input.
|
||||
Indices are selected in the pre-trained language vocabulary,
|
||||
i.e. in the range ``[0, config.n_langs - 1]``.
|
||||
Indices are language ids which can be obtained from the language names by using two conversion mappings
|
||||
provided in the configuration of the model (only provided for multilingual models).
|
||||
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
|
||||
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
|
||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
@@ -480,7 +498,7 @@ class XLMModel(XLMPreTrainedModel):
|
||||
|
||||
"""
|
||||
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
|
||||
'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads',
|
||||
'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads',
|
||||
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
|
||||
'asm_cutoffs', 'asm_div_value']
|
||||
|
||||
@@ -499,6 +517,7 @@ class XLMModel(XLMPreTrainedModel):
|
||||
|
||||
# dictionary / languages
|
||||
self.n_langs = config.n_langs
|
||||
self.use_lang_emb = config.use_lang_emb
|
||||
self.n_words = config.n_words
|
||||
self.eos_index = config.eos_index
|
||||
self.pad_index = config.pad_index
|
||||
@@ -521,7 +540,7 @@ class XLMModel(XLMPreTrainedModel):
|
||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
|
||||
if config.sinusoidal_embeddings:
|
||||
create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
|
||||
if config.n_langs > 1:
|
||||
if config.n_langs > 1 and config.use_lang_emb:
|
||||
self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
|
||||
self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
|
||||
self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
|
||||
@@ -544,7 +563,14 @@ class XLMModel(XLMPreTrainedModel):
|
||||
self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
|
||||
self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
|
||||
|
||||
self.apply(self.init_weights)
|
||||
if hasattr(config, "pruned_heads"):
|
||||
pruned_heads = config.pruned_heads.copy().items()
|
||||
config.pruned_heads = {}
|
||||
for layer, heads in pruned_heads:
|
||||
if self.attentions[int(layer)].n_heads == config.n_heads:
|
||||
self.prune_heads({int(layer): list(map(int, heads))})
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
|
||||
@@ -620,7 +646,7 @@ class XLMModel(XLMPreTrainedModel):
|
||||
# embeddings
|
||||
tensor = self.embeddings(input_ids)
|
||||
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
|
||||
if langs is not None:
|
||||
if langs is not None and self.use_lang_emb:
|
||||
tensor = tensor + self.lang_embeddings(langs)
|
||||
if token_type_ids is not None:
|
||||
tensor = tensor + self.embeddings(token_type_ids)
|
||||
@@ -756,7 +782,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
||||
self.transformer = XLMModel(config)
|
||||
self.pred_layer = XLMPredLayer(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
@@ -818,7 +844,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
||||
self.transformer = XLMModel(config)
|
||||
self.sequence_summary = SequenceSummary(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
|
||||
attention_mask=None, cache=None, labels=None, head_mask=None):
|
||||
@@ -896,7 +922,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
||||
self.transformer = XLMModel(config)
|
||||
self.qa_outputs = SQuADHead(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
|
||||
attention_mask=None, cache=None, start_positions=None, end_positions=None,
|
||||
|
||||
@@ -306,7 +306,7 @@ class XLNetConfig(PretrainedConfig):
|
||||
self.end_n_top = end_n_top
|
||||
else:
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)")
|
||||
" or the path to a pretrained model config file (str)")
|
||||
|
||||
@property
|
||||
def max_position_embeddings(self):
|
||||
@@ -337,20 +337,7 @@ try:
|
||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
|
||||
except (ImportError, AttributeError) as e:
|
||||
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
||||
class XLNetLayerNorm(nn.Module):
|
||||
def __init__(self, d_model, eps=1e-12):
|
||||
"""Construct a layernorm module in the TF style (epsilon inside the square root).
|
||||
"""
|
||||
super(XLNetLayerNorm, self).__init__()
|
||||
self.weight = nn.Parameter(torch.ones(d_model))
|
||||
self.bias = nn.Parameter(torch.zeros(d_model))
|
||||
self.variance_epsilon = eps
|
||||
|
||||
def forward(self, x):
|
||||
u = x.mean(-1, keepdim=True)
|
||||
s = (x - u).pow(2).mean(-1, keepdim=True)
|
||||
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
|
||||
return self.weight * x + self.bias
|
||||
from torch.nn import LayerNorm as XLNetLayerNorm
|
||||
|
||||
class XLNetRelativeAttention(nn.Module):
|
||||
def __init__(self, config):
|
||||
@@ -418,7 +405,10 @@ class XLNetRelativeAttention(nn.Module):
|
||||
attn_score = (ac + bd + ef) * self.scale
|
||||
if attn_mask is not None:
|
||||
# attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
|
||||
attn_score = attn_score - 1e30 * attn_mask
|
||||
if attn_mask.dtype == torch.float16:
|
||||
attn_score = attn_score - 65500 * attn_mask
|
||||
else:
|
||||
attn_score = attn_score - 1e30 * attn_mask
|
||||
|
||||
# attention probability
|
||||
attn_prob = F.softmax(attn_score, dim=1)
|
||||
@@ -596,10 +586,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
|
||||
load_tf_weights = load_tf_weights_in_xlnet
|
||||
base_model_prefix = "transformer"
|
||||
|
||||
def __init__(self, *inputs, **kwargs):
|
||||
super(XLNetPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||
|
||||
def init_weights(self, module):
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||
@@ -647,12 +634,16 @@ XLNET_START_DOCSTRING = r""" The XLNet model was proposed in
|
||||
|
||||
Parameters:
|
||||
config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
XLNET_INPUTS_DOCSTRING = r"""
|
||||
Inputs:
|
||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
XLNet is a model with relative position embeddings so you can either pad the inputs on
|
||||
the right or on the left.
|
||||
Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
|
||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
@@ -673,8 +664,11 @@ XLNET_INPUTS_DOCSTRING = r"""
|
||||
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
|
||||
**mems**: (`optional`)
|
||||
list of ``torch.FloatTensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
|
||||
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
|
||||
To activate mems you need to set up config.mem_len to a positive value which will be the max number of tokens in
|
||||
the memory output by the model. E.g. `model = XLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)` will
|
||||
instantiate a model which can use up to 1024 tokens of memory (in addition to the input itself).
|
||||
**perm_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
|
||||
Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
|
||||
If ``perm_mask[k, i, j] = 0``, i attends to j in batch k;
|
||||
@@ -701,7 +695,8 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
**mems**:
|
||||
list of ``torch.FloatTensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
|
||||
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
|
||||
See details in the docstring of the `mems` input above.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
@@ -738,7 +733,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
||||
self.dropout = nn.Dropout(config.dropout)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
|
||||
@@ -855,7 +850,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
|
||||
|
||||
qlen, bsz = input_ids.shape[0], input_ids.shape[1]
|
||||
mlen = mems[0].shape[0] if mems is not None else 0
|
||||
mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
|
||||
klen = mlen + qlen
|
||||
|
||||
dtype_float = next(self.parameters()).dtype
|
||||
@@ -1007,7 +1002,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
**mems**:
|
||||
list of ``torch.FloatTensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
|
||||
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
|
||||
See details in the docstring of the `mems` input above.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
@@ -1038,7 +1034,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
self.transformer = XLNetModel(config)
|
||||
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
@@ -1087,7 +1083,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
**mems**:
|
||||
list of ``torch.FloatTensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
|
||||
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
|
||||
See details in the docstring of the `mems` input above.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
@@ -1114,7 +1111,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
self.sequence_summary = SequenceSummary(config)
|
||||
self.logits_proj = nn.Linear(config.d_model, config.num_labels)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
||||
mems=None, perm_mask=None, target_mapping=None,
|
||||
@@ -1185,7 +1182,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
**mems**:
|
||||
list of ``torch.FloatTensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
|
||||
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
|
||||
See details in the docstring of the `mems` input above.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
@@ -1215,7 +1213,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
self.end_logits = PoolerEndLogits(config)
|
||||
self.answer_class = PoolerAnswerClass(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
||||
mems=None, perm_mask=None, target_mapping=None,
|
||||
|
||||
@@ -21,7 +21,11 @@ import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
|
||||
from pytorch_transformers import (AutoConfig, BertConfig,
|
||||
AutoModel, BertModel,
|
||||
AutoModelWithLMHead, BertForMaskedLM,
|
||||
AutoModelForSequenceClassification, BertForSequenceClassification,
|
||||
AutoModelForQuestionAnswering, BertForQuestionAnswering)
|
||||
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
||||
@@ -42,6 +46,42 @@ class AutoModelTest(unittest.TestCase):
|
||||
for value in loading_info.values():
|
||||
self.assertEqual(len(value), 0)
|
||||
|
||||
def test_lmhead_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
self.assertIsNotNone(config)
|
||||
self.assertIsInstance(config, BertConfig)
|
||||
|
||||
model = AutoModelWithLMHead.from_pretrained(model_name)
|
||||
model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True)
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertForMaskedLM)
|
||||
|
||||
def test_sequence_classification_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
self.assertIsNotNone(config)
|
||||
self.assertIsInstance(config, BertConfig)
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
||||
model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertForSequenceClassification)
|
||||
|
||||
def test_question_answering_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
self.assertIsNotNone(config)
|
||||
self.assertIsInstance(config, BertConfig)
|
||||
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
|
||||
model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertForQuestionAnswering)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -21,6 +21,7 @@ import os
|
||||
import shutil
|
||||
import json
|
||||
import random
|
||||
import uuid
|
||||
|
||||
import unittest
|
||||
import logging
|
||||
@@ -48,6 +49,7 @@ class CommonTestCases:
|
||||
test_torchscript = True
|
||||
test_pruning = True
|
||||
test_resize_embeddings = True
|
||||
test_head_masking = True
|
||||
|
||||
def test_initialization(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
@@ -158,6 +160,10 @@ class CommonTestCases:
|
||||
|
||||
|
||||
def test_headmasking(self):
|
||||
if not self.test_head_masking:
|
||||
return
|
||||
|
||||
torch.manual_seed(42)
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
config.output_attentions = True
|
||||
@@ -207,9 +213,12 @@ class CommonTestCases:
|
||||
if not self.test_pruning:
|
||||
return
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
if "head_mask" in inputs_dict:
|
||||
del inputs_dict["head_mask"]
|
||||
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
model = model_class(config=config)
|
||||
@@ -228,6 +237,120 @@ class CommonTestCases:
|
||||
self.assertEqual(
|
||||
attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||
|
||||
def test_head_pruning_save_load_from_pretrained(self):
|
||||
if not self.test_pruning:
|
||||
return
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
if "head_mask" in inputs_dict:
|
||||
del inputs_dict["head_mask"]
|
||||
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
model = model_class(config=config)
|
||||
model.eval()
|
||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||
-1: [0]}
|
||||
model.prune_heads(heads_to_prune)
|
||||
directory = "pruned_model"
|
||||
if not os.path.exists(directory):
|
||||
os.makedirs(directory)
|
||||
model.save_pretrained(directory)
|
||||
model = model_class.from_pretrained(directory)
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
self.assertEqual(attentions[0].shape[-3], 1)
|
||||
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
||||
self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||
|
||||
shutil.rmtree(directory)
|
||||
|
||||
def test_head_pruning_save_load_from_config_init(self):
|
||||
if not self.test_pruning:
|
||||
return
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
if "head_mask" in inputs_dict:
|
||||
del inputs_dict["head_mask"]
|
||||
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
|
||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||
-1: [0]}
|
||||
config.pruned_heads = heads_to_prune
|
||||
|
||||
model = model_class(config=config)
|
||||
model.eval()
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
|
||||
self.assertEqual(attentions[0].shape[-3], 1)
|
||||
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
||||
self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||
|
||||
def test_head_pruning_integration(self):
|
||||
if not self.test_pruning:
|
||||
return
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
if "head_mask" in inputs_dict:
|
||||
del inputs_dict["head_mask"]
|
||||
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
|
||||
heads_to_prune = {0: [0], 1: [1, 2]}
|
||||
config.pruned_heads = heads_to_prune
|
||||
|
||||
model = model_class(config=config)
|
||||
model.eval()
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
|
||||
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
|
||||
self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
|
||||
self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
|
||||
|
||||
directory = "pruned_model"
|
||||
|
||||
if not os.path.exists(directory):
|
||||
os.makedirs(directory)
|
||||
model.save_pretrained(directory)
|
||||
model = model_class.from_pretrained(directory)
|
||||
shutil.rmtree(directory)
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
|
||||
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
|
||||
self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
|
||||
self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
|
||||
|
||||
heads_to_prune = {0: [0], 2: [1, 2]}
|
||||
model.prune_heads(heads_to_prune)
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
|
||||
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
|
||||
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
|
||||
self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
|
||||
self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
|
||||
|
||||
self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
|
||||
|
||||
|
||||
def test_hidden_states_output(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
@@ -281,6 +404,9 @@ class CommonTestCases:
|
||||
self.assertTrue(models_equal)
|
||||
|
||||
def test_tie_model_weights(self):
|
||||
if not self.test_torchscript:
|
||||
return
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
def check_same_values(layer_1, layer_2):
|
||||
@@ -527,7 +653,7 @@ class ConfigTester(object):
|
||||
|
||||
def create_and_test_config_to_json_file(self):
|
||||
config_first = self.config_class(**self.inputs_dict)
|
||||
json_file_path = "/tmp/config.json"
|
||||
json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
|
||||
config_first.to_json_file(json_file_path)
|
||||
config_second = self.config_class.from_json_file(json_file_path)
|
||||
os.remove(json_file_path)
|
||||
|
||||
217
pytorch_transformers/tests/modeling_distilbert_test.py
Normal file
217
pytorch_transformers/tests/modeling_distilbert_test.py
Normal file
@@ -0,0 +1,217 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
|
||||
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
|
||||
from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
||||
|
||||
|
||||
class DistilBertModelTest(CommonTestCases.CommonModelTester):
    """DistilBERT model tests: output-shape checks plus the inherited common tests."""

    all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
                         DistilBertForSequenceClassification)
    # Feature flags; presumably consulted by the inherited common tests
    # (e.g. ``test_pruning``) -- confirm in modeling_common_test.
    test_pruning = True
    test_torchscript = True
    test_resize_embeddings = True
    test_head_masking = True

    class DistilBertModelTester(object):
        """Builds a small ``DistilBertConfig`` with random inputs and checks model output shapes."""

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_token_type_ids=False,
                     use_labels=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            """Store the test case (``parent``, used for assertions) and the size settings."""
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            """Return ``(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels)``.

            The mask is ``None`` unless ``use_input_mask`` is set and the three
            label tensors are ``None`` unless ``use_labels`` is set.
            """
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = DistilBertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
                hidden_dim=self.intermediate_size,
                hidden_act=self.hidden_act,
                dropout=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range)

            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            """Assert that ``result["loss"]`` has an empty size, i.e. is a scalar."""
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            """Run ``DistilBertModel`` with and without the mask and check the output shape."""
            model = DistilBertModel(config=config)
            model.eval()
            (sequence_output,) = model(input_ids, input_mask)
            (sequence_output,) = model(input_ids)

            result = {
                "sequence_output": sequence_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            """Run ``DistilBertForMaskedLM`` with labels and check the score and loss shapes."""
            model = DistilBertForMaskedLM(config=config)
            model.eval()
            loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.check_loss_output(result)

        def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            """Run ``DistilBertForQuestionAnswering`` and check the start/end logit and loss shapes."""
            model = DistilBertForQuestionAnswering(config=config)
            model.eval()
            # ``sequence_labels`` is passed twice, as the third and fourth positional arguments.
            loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(
                list(result["start_logits"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["end_logits"].size()),
                [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
            """Run ``DistilBertForSequenceClassification`` and check the logit and loss shapes."""
            config.num_labels = self.num_labels
            model = DistilBertForSequenceClassification(config)
            model.eval()
            loss, logits = model(input_ids, input_mask, sequence_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            """Return ``(config, inputs_dict)`` with ``input_ids`` and ``attention_mask`` entries."""
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = DistilBertModelTest.DistilBertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_distilbert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)

    # Disabled test, kept commented out exactly as in the original.
    # @pytest.mark.slow
    # def test_model_from_pretrained(self):
    #     cache_dir = "/tmp/pytorch_transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
    #         shutil.rmtree(cache_dir)
    #         self.assertIsNotNone(model)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -18,31 +18,196 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import shutil
|
||||
|
||||
|
||||
from pytorch_transformers import (GPT2Config, GPT2Model,
|
||||
from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||
|
||||
from .modeling_common_test import CommonTestCases, ConfigTester
|
||||
from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor
|
||||
|
||||
class GPT2ModelTest(CommonTestCases.CommonModelTester):
    """GPT-2 model tests: output-shape checks plus the inherited common tests.

    Reconstructed from the post-change side of the diff hunk: the earlier
    ``unittest.TestCase`` header and the ``test_model`` / ``test_pretrained``
    methods built on ``CommonTestCases.GPTModelTester`` are the replaced lines
    and are not carried over.
    """

    all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel)

    class GPT2ModelTester(object):
        """Builds a small ``GPT2Config`` with random inputs and checks model output shapes."""

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_token_type_ids=True,
                     use_labels=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            """Store the test case (``parent``, used for assertions) and the size settings."""
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            """Return ``(config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels)``.

            ``token_type_ids`` is ``None`` unless ``use_token_type_ids`` is set and
            the three label tensors are ``None`` unless ``use_labels`` is set.
            """
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = GPT2Config(
                vocab_size_or_config_json_file=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            # One 0/1 entry per (layer, head).
            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            """Assert that ``result["loss"]`` has an empty size, i.e. is a scalar."""
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args):
            """Run ``GPT2Model`` with three argument combinations and check the output shapes."""
            model = GPT2Model(config=config)
            model.eval()

            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
            model(input_ids, token_type_ids=token_type_ids)
            sequence_output, presents = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "presents": presents,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertEqual(len(result["presents"]), config.n_layer)

        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
            """Run ``GPT2LMHeadModel`` with ``input_ids`` as labels and check loss and logit shapes."""
            model = GPT2LMHeadModel(config)
            model.eval()

            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

            result = {
                "loss": loss,
                "lm_logits": lm_logits
            }

            self.check_loss_output(result)
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])

        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
            """Run ``GPT2DoubleHeadsModel`` with LM labels and check loss and LM-logit shapes."""
            model = GPT2DoubleHeadsModel(config)
            model.eval()

            loss, lm_logits, mc_logits, _ = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)

            result = {
                "loss": loss,
                "lm_logits": lm_logits
            }

            self.check_loss_output(result)
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])

        def prepare_config_and_inputs_for_common(self):
            """Return ``(config, inputs_dict)`` with ``input_ids``, ``token_type_ids`` and ``head_mask``."""
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
                'head_mask': head_mask
            }

            return config, inputs_dict

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = GPT2ModelTest.GPT2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_gpt2_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)

    def test_gpt2_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

    def test_gpt2_double_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        """Load the first archive-map entry with ``from_pretrained`` and remove the cache directory."""
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -18,31 +18,194 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import shutil
|
||||
|
||||
|
||||
from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
|
||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||
from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||
|
||||
from .modeling_common_test import CommonTestCases, ConfigTester
|
||||
from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor
|
||||
|
||||
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
    """OpenAI GPT model tests: output-shape checks plus the inherited common tests.

    Reconstructed from the post-change side of the diff hunk: the earlier
    ``OpenAIModelTest(unittest.TestCase)`` header and the ``test_model`` /
    ``test_pretrained`` methods built on ``CommonTestCases.GPTModelTester`` are
    the replaced lines and are not carried over.
    """

    all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)

    class OpenAIGPTModelTester(object):
        """Builds a small ``OpenAIGPTConfig`` with random inputs and checks model output shapes."""

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_token_type_ids=True,
                     use_labels=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            """Store the test case (``parent``, used for assertions) and the size settings."""
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            """Return ``(config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels)``.

            ``token_type_ids`` is ``None`` unless ``use_token_type_ids`` is set and
            the three label tensors are ``None`` unless ``use_labels`` is set.
            """
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = OpenAIGPTConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            # One 0/1 entry per (layer, head).
            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            """Assert that ``result["loss"]`` has an empty size, i.e. is a scalar."""
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
            """Run ``OpenAIGPTModel`` with three argument combinations and check the output shape."""
            model = OpenAIGPTModel(config=config)
            model.eval()

            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
            model(input_ids, token_type_ids=token_type_ids)
            (sequence_output,) = model(input_ids)

            result = {
                "sequence_output": sequence_output
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
            """Run ``OpenAIGPTLMHeadModel`` with ``input_ids`` as labels and check loss and logit shapes."""
            model = OpenAIGPTLMHeadModel(config)
            model.eval()

            loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)

            result = {
                "loss": loss,
                "lm_logits": lm_logits
            }

            self.check_loss_output(result)
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])

        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
            """Run ``OpenAIGPTDoubleHeadsModel`` with LM labels and check loss and LM-logit shapes."""
            model = OpenAIGPTDoubleHeadsModel(config)
            model.eval()

            loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)

            result = {
                "loss": loss,
                "lm_logits": lm_logits
            }

            self.check_loss_output(result)
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])

        def prepare_config_and_inputs_for_common(self):
            """Return ``(config, inputs_dict)`` with ``input_ids``, ``token_type_ids`` and ``head_mask``."""
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
                'head_mask': head_mask
            }

            return config, inputs_dict

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self)
        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_openai_gpt_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs)

    def test_openai_gpt_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)

    def test_openai_gpt_double_lm_head_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        """Load the first archive-map entry with ``from_pretrained`` and remove the cache directory."""
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -41,8 +41,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
|
||||
def get_tokenizer(self, **kwargs):
    """Load a ``BertTokenizer`` from ``self.tmpdirname``.

    Any keyword arguments are forwarded unchanged to
    ``BertTokenizer.from_pretrained``; calling with no arguments behaves like
    the earlier zero-argument version of this method.
    """
    return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"UNwant\u00E9d,running"
|
||||
@@ -50,7 +50,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
return input_text, output_text
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = BertTokenizer(self.vocab_file)
|
||||
tokenizer = self.tokenizer_class(self.vocab_file)
|
||||
|
||||
tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
|
||||
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
|
||||
@@ -126,7 +126,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
self.assertFalse(_is_punctuation(u" "))
|
||||
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
|
||||
|
||||
text = tokenizer.encode("sequence builders")
|
||||
text_2 = tokenizer.encode("multi-sequence build")
|
||||
|
||||
46
pytorch_transformers/tests/tokenization_dilbert_test.py
Normal file
46
pytorch_transformers/tests/tokenization_dilbert_test.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import os
|
||||
import unittest
|
||||
from io import open
|
||||
|
||||
from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer)
|
||||
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
from .tokenization_bert_test import BertTokenizationTest
|
||||
|
||||
class DistilBertTokenizationTest(BertTokenizationTest):
    """Runs the tests inherited from ``BertTokenizationTest`` with ``DistilBertTokenizer``."""

    tokenizer_class = DistilBertTokenizer

    def get_tokenizer(self, **kwargs):
        """Load a ``DistilBertTokenizer`` from ``self.tmpdirname``, forwarding ``kwargs``."""
        return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def test_sequence_builders(self):
        """Check the ids added around a single sentence and around a sentence pair."""
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        # unittest assertions are not stripped under ``python -O`` and report
        # both operands on failure, unlike a bare ``assert``.
        self.assertEqual(encoded_sentence, [101] + text + [102])
        self.assertEqual(encoded_pair, [101] + text + [102] + text_2 + [102])
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
import os
|
||||
import unittest
|
||||
import json
|
||||
from io import open
|
||||
|
||||
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
|
||||
|
||||
@@ -31,36 +32,38 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||
"lo", "low", "er",
|
||||
"low", "lowest", "newer", "wider", "<unk>"]
|
||||
"\u0120", "\u0120l", "\u0120n",
|
||||
"\u0120lo", "\u0120low", "er",
|
||||
"\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
|
||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||
merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
|
||||
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
||||
|
||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||
with open(self.vocab_file, "w") as fp:
|
||||
fp.write(json.dumps(vocab_tokens))
|
||||
with open(self.merges_file, "w") as fp:
|
||||
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
||||
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
|
||||
def get_tokenizer(self):
|
||||
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
|
||||
def get_tokenizer(self, **kwargs):
|
||||
kwargs.update(self.special_tokens_map)
|
||||
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"lower newer"
|
||||
output_text = u"lower<unk>newer"
|
||||
output_text = u" lower newer"
|
||||
return input_text, output_text
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||
text = "lower"
|
||||
bpe_tokens = ["low", "er"]
|
||||
text = "lower newer"
|
||||
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
|
||||
input_tokens = tokens + [tokenizer.unk_token]
|
||||
input_bpe_tokens = [13, 12, 17]
|
||||
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
|
||||
|
||||
@@ -45,8 +45,8 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
with open(self.merges_file, "w") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
|
||||
def get_tokenizer(self):
|
||||
return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"lower newer"
|
||||
|
||||
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
import os
|
||||
import json
|
||||
import unittest
|
||||
from io import open
|
||||
|
||||
from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
@@ -30,36 +31,38 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||
"lo", "low", "er",
|
||||
"low", "lowest", "newer", "wider", "<unk>"]
|
||||
"\u0120", "\u0120l", "\u0120n",
|
||||
"\u0120lo", "\u0120low", "er",
|
||||
"\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
|
||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||
merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
|
||||
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
||||
|
||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||
with open(self.vocab_file, "w") as fp:
|
||||
fp.write(json.dumps(vocab_tokens))
|
||||
with open(self.merges_file, "w") as fp:
|
||||
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
||||
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
|
||||
def get_tokenizer(self):
|
||||
return RobertaTokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
|
||||
def get_tokenizer(self, **kwargs):
|
||||
kwargs.update(self.special_tokens_map)
|
||||
return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"lower newer"
|
||||
output_text = u"lower<unk>newer"
|
||||
output_text = u" lower newer"
|
||||
return input_text, output_text
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||
text = "lower"
|
||||
bpe_tokens = ["low", "er"]
|
||||
text = "lower newer"
|
||||
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
|
||||
input_tokens = tokens + [tokenizer.unk_token]
|
||||
input_bpe_tokens = [13, 12, 17]
|
||||
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
|
||||
|
||||
@@ -49,23 +49,32 @@ class CommonTestCases:
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdirname)
|
||||
|
||||
def get_tokenizer(self):
|
||||
def get_tokenizer(self, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
def get_input_output_texts(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def test_save_and_load_tokenizer(self):
|
||||
# safety check on max_len default value so we are sure the test works
|
||||
tokenizer = self.get_tokenizer()
|
||||
self.assertNotEqual(tokenizer.max_len, 42)
|
||||
|
||||
# Now let's start the test
|
||||
tokenizer = self.get_tokenizer(max_len=42)
|
||||
|
||||
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
tokenizer.save_pretrained(tmpdirname)
|
||||
tokenizer = tokenizer.from_pretrained(tmpdirname)
|
||||
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
|
||||
|
||||
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||
self.assertListEqual(before_tokens, after_tokens)
|
||||
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||
self.assertListEqual(before_tokens, after_tokens)
|
||||
|
||||
self.assertEqual(tokenizer.max_len, 42)
|
||||
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
|
||||
self.assertEqual(tokenizer.max_len, 43)
|
||||
|
||||
def test_pickle_tokenizer(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
@@ -95,7 +104,7 @@ class CommonTestCases:
|
||||
self.assertNotEqual(vocab_size, 0)
|
||||
self.assertEqual(vocab_size, all_size)
|
||||
|
||||
new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
|
||||
new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
|
||||
added_toks = tokenizer.add_tokens(new_toks)
|
||||
vocab_size_2 = tokenizer.vocab_size
|
||||
all_size_2 = len(tokenizer)
|
||||
@@ -105,13 +114,15 @@ class CommonTestCases:
|
||||
self.assertEqual(added_toks, len(new_toks))
|
||||
self.assertEqual(all_size_2, all_size + len(new_toks))
|
||||
|
||||
tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
|
||||
tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
|
||||
out_string = tokenizer.decode(tokens)
|
||||
|
||||
self.assertGreaterEqual(len(tokens), 4)
|
||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||
|
||||
new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
|
||||
'pad_token': "<<<<<|||>|>>>>|>"}
|
||||
'pad_token': "<<<<<|||>|>>>>|>"}
|
||||
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
|
||||
vocab_size_3 = tokenizer.vocab_size
|
||||
all_size_3 = len(tokenizer)
|
||||
@@ -122,14 +133,15 @@ class CommonTestCases:
|
||||
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
||||
|
||||
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
|
||||
out_string = tokenizer.decode(tokens)
|
||||
|
||||
self.assertGreaterEqual(len(tokens), 6)
|
||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||
self.assertGreater(tokens[0], tokens[1])
|
||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||
self.assertGreater(tokens[-2], tokens[-3])
|
||||
self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
|
||||
self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
|
||||
self.assertEqual(tokens[0], tokenizer.eos_token_id)
|
||||
self.assertEqual(tokens[-2], tokenizer.pad_token_id)
|
||||
|
||||
|
||||
def test_required_methods_tokenizer(self):
|
||||
|
||||
@@ -37,8 +37,9 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
|
||||
def get_tokenizer(self):
|
||||
return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)
|
||||
def get_tokenizer(self, **kwargs):
|
||||
kwargs['lower_case'] = True
|
||||
return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"<unk> UNwanted , running"
|
||||
|
||||
@@ -44,8 +44,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
with open(self.merges_file, "w") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
|
||||
def get_tokenizer(self):
|
||||
return XLMTokenizer.from_pretrained(self.tmpdirname)
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"lower newer"
|
||||
|
||||
@@ -35,8 +35,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
def get_tokenizer(self):
|
||||
return XLNetTokenizer.from_pretrained(self.tmpdirname)
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"This is a test"
|
||||
|
||||
@@ -24,6 +24,8 @@ from .tokenization_gpt2 import GPT2Tokenizer
|
||||
from .tokenization_transfo_xl import TransfoXLTokenizer
|
||||
from .tokenization_xlnet import XLNetTokenizer
|
||||
from .tokenization_xlm import XLMTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer
|
||||
from .tokenization_distilbert import DistilBertTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -38,6 +40,8 @@ class AutoTokenizer(object):
|
||||
|
||||
The tokenizer class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
|
||||
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
||||
- contains `bert`: BertTokenizer (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
||||
@@ -58,6 +62,8 @@ class AutoTokenizer(object):
|
||||
|
||||
The tokenizer class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
|
||||
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
||||
- contains `bert`: BertTokenizer (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
||||
@@ -66,23 +72,37 @@ class AutoTokenizer(object):
|
||||
- contains `xlm`: XLMTokenizer (XLM model)
|
||||
|
||||
Params:
|
||||
**pretrained_model_name_or_path**: either:
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
|
||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
||||
- a path to a `directory` containing a configuration file saved
|
||||
using the `save_pretrained(save_directory)` method.
|
||||
- a path or url to a saved configuration `file`.
|
||||
**cache_dir**: (`optional`) string:
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
|
||||
|
||||
cache_dir: (`optional`) string:
|
||||
Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
|
||||
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the vocabulary files and override the cached versions if they exist.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
|
||||
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
|
||||
|
||||
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
|
||||
|
||||
Examples::
|
||||
|
||||
config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache.
|
||||
config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
||||
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache.
|
||||
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/bert_saved_model/')`
|
||||
|
||||
"""
|
||||
if 'bert' in pretrained_model_name_or_path:
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'bert' in pretrained_model_name_or_path:
|
||||
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'openai-gpt' in pretrained_model_name_or_path:
|
||||
return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
@@ -97,4 +117,4 @@ class AutoTokenizer(object):
|
||||
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm'".format(pretrained_model_name_or_path))
|
||||
"'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
||||
|
||||
@@ -63,6 +63,23 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
'bert-base-cased-finetuned-mrpc': 512,
|
||||
}
|
||||
|
||||
PRETRAINED_INIT_CONFIGURATION = {
|
||||
'bert-base-uncased': {'do_lower_case': True},
|
||||
'bert-large-uncased': {'do_lower_case': True},
|
||||
'bert-base-cased': {'do_lower_case': False},
|
||||
'bert-large-cased': {'do_lower_case': False},
|
||||
'bert-base-multilingual-uncased': {'do_lower_case': True},
|
||||
'bert-base-multilingual-cased': {'do_lower_case': False},
|
||||
'bert-base-chinese': {'do_lower_case': False},
|
||||
'bert-base-german-cased': {'do_lower_case': False},
|
||||
'bert-large-uncased-whole-word-masking': {'do_lower_case': True},
|
||||
'bert-large-cased-whole-word-masking': {'do_lower_case': False},
|
||||
'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
|
||||
'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
|
||||
'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
|
||||
}
|
||||
|
||||
|
||||
def load_vocab(vocab_file):
|
||||
"""Loads a vocabulary file into a dictionary."""
|
||||
vocab = collections.OrderedDict()
|
||||
@@ -100,6 +117,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
|
||||
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
|
||||
@@ -125,6 +143,9 @@ class BertTokenizer(PreTrainedTokenizer):
|
||||
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
|
||||
pad_token=pad_token, cls_token=cls_token,
|
||||
mask_token=mask_token, **kwargs)
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
|
||||
@@ -171,15 +192,15 @@ class BertTokenizer(PreTrainedTokenizer):
|
||||
Adds special tokens to a sequence for sequence classification tasks.
|
||||
A BERT sequence has the following format: [CLS] X [SEP]
|
||||
"""
|
||||
return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
|
||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
||||
|
||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||
"""
|
||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
||||
A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
||||
"""
|
||||
sep = [self._convert_token_to_id(self.sep_token)]
|
||||
cls = [self._convert_token_to_id(self.cls_token)]
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||
|
||||
def save_vocabulary(self, vocab_path):
|
||||
@@ -187,6 +208,8 @@ class BertTokenizer(PreTrainedTokenizer):
|
||||
index = 0
|
||||
if os.path.isdir(vocab_path):
|
||||
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
|
||||
else:
|
||||
vocab_file = vocab_path
|
||||
with open(vocab_file, "w", encoding="utf-8") as writer:
|
||||
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
||||
if index != token_index:
|
||||
@@ -197,24 +220,6 @@ class BertTokenizer(PreTrainedTokenizer):
|
||||
index += 1
|
||||
return (vocab_file,)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||
""" Instantiate a BertTokenizer from pre-trained vocabulary files.
|
||||
"""
|
||||
if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES:
|
||||
if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
|
||||
logger.warning("The pre-trained model you are loading is a cased model but you have not set "
|
||||
"`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
|
||||
"you may want to check this behavior.")
|
||||
kwargs['do_lower_case'] = False
|
||||
elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
|
||||
logger.warning("The pre-trained model you are loading is an uncased model but you have set "
|
||||
"`do_lower_case` to False. We are setting `do_lower_case=True` for you "
|
||||
"but you may want to check this behavior.")
|
||||
kwargs['do_lower_case'] = True
|
||||
|
||||
return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
|
||||
|
||||
class BasicTokenizer(object):
|
||||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
||||
|
||||
62
pytorch_transformers/tokenization_distilbert.py
Normal file
62
pytorch_transformers/tokenization_distilbert.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for DistilBERT."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import collections
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from io import open
|
||||
|
||||
from .tokenization_bert import BertTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
'vocab_file':
|
||||
{
|
||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
|
||||
}
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
'distilbert-base-uncased': 512,
|
||||
'distilbert-base-uncased-distilled-squad': 512,
|
||||
}
|
||||
|
||||
|
||||
class DistilBertTokenizer(BertTokenizer):
|
||||
r"""
|
||||
Constructs a DistilBertTokenizer.
|
||||
:class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
|
||||
|
||||
Args:
|
||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file
|
||||
do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
|
||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
|
||||
max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
|
||||
minimum of this value (if specified) and the underlying BERT model's sequence length.
|
||||
never_split: List of tokens which will never be split during tokenization. Only has an effect when
|
||||
do_wordpiece_only=False
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
@@ -45,29 +45,33 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
||||
{
|
||||
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
|
||||
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
|
||||
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
|
||||
},
|
||||
'merges_file':
|
||||
{
|
||||
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
|
||||
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
|
||||
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
|
||||
},
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
'gpt2': 1024,
|
||||
'gpt2-medium': 1024,
|
||||
'gpt2-large': 1024,
|
||||
}
|
||||
|
||||
@lru_cache()
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
Returns list of utf-8 byte and a mapping to unicode strings.
|
||||
We specifically avoid mapping to whitespace/control characters the bpe code barfs on.
|
||||
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
_chr = unichr if sys.version_info[0] == 2 else chr
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
@@ -96,7 +100,10 @@ def get_pairs(word):
|
||||
class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
GPT-2 BPE tokenizer. Peculiarities:
|
||||
- Byte-level BPE
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => will add a space if there isn't.
|
||||
As a consequence, this tokenizer `encode` and `decode` method will not conserve
|
||||
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
|
||||
"""
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
@@ -105,12 +112,14 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
|
||||
bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
|
||||
super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
|
||||
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
self.encoder = json.load(open(vocab_file))
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.byte_encoder = bytes_to_unicode()
|
||||
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
|
||||
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
@@ -166,12 +175,13 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
def _tokenize(self, text):
|
||||
""" Tokenize a string. """
|
||||
text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
|
||||
bpe_tokens = []
|
||||
for token in re.findall(self.pat, text):
|
||||
if sys.version_info[0] == 2:
|
||||
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
||||
token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
|
||||
else:
|
||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
|
||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||
return bpe_tokens
|
||||
|
||||
@@ -211,4 +221,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||
index += 1
|
||||
|
||||
return vocab_file, merge_file
|
||||
return vocab_file, merge_file
|
||||
@@ -87,10 +87,14 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
||||
super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
|
||||
|
||||
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
try:
|
||||
import ftfy
|
||||
import spacy
|
||||
self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
|
||||
from spacy.lang.en import English
|
||||
_nlp = English()
|
||||
self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
|
||||
self.fix_text = ftfy.fix_text
|
||||
except ImportError:
|
||||
logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
|
||||
|
||||
@@ -23,8 +23,7 @@ import os
|
||||
import regex as re
|
||||
from io import open
|
||||
|
||||
from .tokenization_gpt2 import bytes_to_unicode, get_pairs
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
from .tokenization_gpt2 import GPT2Tokenizer
|
||||
|
||||
try:
|
||||
from functools import lru_cache
|
||||
@@ -63,9 +62,13 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
}
|
||||
|
||||
|
||||
class RobertaTokenizer(PreTrainedTokenizer):
|
||||
class RobertaTokenizer(GPT2Tokenizer):
|
||||
"""
|
||||
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
|
||||
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => will add a space if there isn't.
|
||||
As a consequence, this tokenizer `encode` and `decode` method will not conserve
|
||||
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
|
||||
"""
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
@@ -73,129 +76,23 @@ class RobertaTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
|
||||
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
|
||||
super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
|
||||
super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
|
||||
bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
|
||||
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
|
||||
mask_token=mask_token, **kwargs)
|
||||
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.byte_encoder = bytes_to_unicode()
|
||||
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
self.cache = {}
|
||||
|
||||
# Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return len(self.encoder)
|
||||
|
||||
def bpe(self, token):
|
||||
if token in self.cache:
|
||||
return self.cache[token]
|
||||
word = tuple(token)
|
||||
pairs = get_pairs(word)
|
||||
|
||||
if not pairs:
|
||||
return token
|
||||
|
||||
while True:
|
||||
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
|
||||
if bigram not in self.bpe_ranks:
|
||||
break
|
||||
first, second = bigram
|
||||
new_word = []
|
||||
i = 0
|
||||
while i < len(word):
|
||||
try:
|
||||
j = word.index(first, i)
|
||||
new_word.extend(word[i:j])
|
||||
i = j
|
||||
except:
|
||||
new_word.extend(word[i:])
|
||||
break
|
||||
|
||||
if word[i] == first and i < len(word)-1 and word[i+1] == second:
|
||||
new_word.append(first+second)
|
||||
i += 2
|
||||
else:
|
||||
new_word.append(word[i])
|
||||
i += 1
|
||||
new_word = tuple(new_word)
|
||||
word = new_word
|
||||
if len(word) == 1:
|
||||
break
|
||||
else:
|
||||
pairs = get_pairs(word)
|
||||
word = ' '.join(word)
|
||||
self.cache[token] = word
|
||||
return word
|
||||
|
||||
def _tokenize(self, text):
|
||||
""" Tokenize a string. """
|
||||
bpe_tokens = []
|
||||
for token in re.findall(self.pat, text):
|
||||
if sys.version_info[0] == 2:
|
||||
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
||||
else:
|
||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||
return bpe_tokens
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
||||
return self.decoder.get(index)
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
""" Converts a sequence of tokens (string) in a single string. """
|
||||
text = ''.join(tokens)
|
||||
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
|
||||
return text
|
||||
|
||||
def add_special_tokens_single_sentence(self, token_ids):
|
||||
"""
|
||||
Adds special tokens to a sequence for sequence classification tasks.
|
||||
A RoBERTa sequence has the following format: [CLS] X [SEP]
|
||||
A RoBERTa sequence has the following format: <s> X </s>
|
||||
"""
|
||||
return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
|
||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
||||
|
||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||
"""
|
||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
||||
A RoBERTa sequence pair has the following format: [CLS] A [SEP][SEP] B [SEP]
|
||||
A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
|
||||
"""
|
||||
sep = [self._convert_token_to_id(self.sep_token)]
|
||||
cls = [self._convert_token_to_id(self.cls_token)]
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||
|
||||
def save_vocabulary(self, save_directory):
|
||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||
return
|
||||
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
|
||||
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
|
||||
|
||||
with open(vocab_file, 'w', encoding='utf-8') as f:
|
||||
f.write(json.dumps(self.encoder, ensure_ascii=False))
|
||||
|
||||
index = 0
|
||||
with open(merge_file, "w", encoding="utf-8") as writer:
|
||||
writer.write(u'#version: 0.2\n')
|
||||
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
|
||||
if index != token_index:
|
||||
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
|
||||
" Please check that the tokenizer is not corrupted!".format(merge_file))
|
||||
index = token_index
|
||||
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||
index += 1
|
||||
|
||||
return vocab_file, merge_file
|
||||
|
||||
@@ -73,6 +73,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
**kwargs)
|
||||
|
||||
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
if never_split is None:
|
||||
never_split = self.all_special_tokens
|
||||
if special is None:
|
||||
@@ -91,7 +95,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
# in a library like ours, at all.
|
||||
vocab_dict = torch.load(pretrained_vocab_file)
|
||||
for key, value in vocab_dict.items():
|
||||
self.__dict__[key] = value
|
||||
if key not in self.__dict__:
|
||||
self.__dict__[key] = value
|
||||
|
||||
if vocab_file is not None:
|
||||
self.build_vocab()
|
||||
|
||||
@@ -20,6 +20,7 @@ import logging
|
||||
import os
|
||||
import json
|
||||
import six
|
||||
import copy
|
||||
from io import open
|
||||
|
||||
from .file_utils import cached_path
|
||||
@@ -28,6 +29,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
|
||||
ADDED_TOKENS_FILE = 'added_tokens.json'
|
||||
TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'
|
||||
|
||||
class PreTrainedTokenizer(object):
|
||||
""" Base class for all tokenizers.
|
||||
@@ -40,27 +42,29 @@ class PreTrainedTokenizer(object):
|
||||
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
|
||||
- ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
|
||||
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
|
||||
- ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method.
|
||||
|
||||
Parameters:
|
||||
|
||||
- ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token``
|
||||
- ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id``
|
||||
|
||||
- ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token``
|
||||
- ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id``
|
||||
|
||||
- ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token``
|
||||
- ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id``
|
||||
|
||||
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token``
|
||||
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id``
|
||||
|
||||
- ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token``
|
||||
- ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id``
|
||||
|
||||
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token``
|
||||
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id``
|
||||
|
||||
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token``
|
||||
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``
|
||||
|
||||
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens``
|
||||
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
|
||||
"""
|
||||
vocab_files_names = {}
|
||||
pretrained_vocab_files_map = {}
|
||||
pretrained_init_configuration = {}
|
||||
max_model_input_sizes = {}
|
||||
|
||||
SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
|
||||
@@ -155,6 +159,62 @@ class PreTrainedTokenizer(object):
|
||||
def additional_special_tokens(self, value):
|
||||
self._additional_special_tokens = value
|
||||
|
||||
@property
|
||||
def bos_token_id(self):
|
||||
""" Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """
|
||||
if self._bos_token is None:
|
||||
logger.error("Using bos_token, but it is not set yet.")
|
||||
return self.convert_tokens_to_ids(self._bos_token)
|
||||
|
||||
@property
|
||||
def eos_token_id(self):
|
||||
""" Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """
|
||||
if self._eos_token is None:
|
||||
logger.error("Using eos_token, but it is not set yet.")
|
||||
return self.convert_tokens_to_ids(self._eos_token)
|
||||
|
||||
@property
|
||||
def unk_token_is(self):
|
||||
""" Id of the unknown token in the vocabulary. Log an error if used while not having been set. """
|
||||
if self._unk_token is None:
|
||||
logger.error("Using unk_token, but it is not set yet.")
|
||||
return self.convert_tokens_to_ids(self._unk_token)
|
||||
|
||||
@property
|
||||
def sep_token_id(self):
|
||||
""" Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
|
||||
if self._sep_token is None:
|
||||
logger.error("Using sep_token, but it is not set yet.")
|
||||
return self.convert_tokens_to_ids(self._sep_token)
|
||||
|
||||
@property
|
||||
def pad_token_id(self):
|
||||
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """
|
||||
if self._pad_token is None:
|
||||
logger.error("Using pad_token, but it is not set yet.")
|
||||
return self.convert_tokens_to_ids(self._pad_token)
|
||||
|
||||
@property
|
||||
def cls_token_id(self):
|
||||
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
|
||||
if self._cls_token is None:
|
||||
logger.error("Using cls_token, but it is not set yet.")
|
||||
return self.convert_tokens_to_ids(self._cls_token)
|
||||
|
||||
@property
|
||||
def mask_token_id(self):
|
||||
""" Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
|
||||
if self._mask_token is None:
|
||||
logger.error("Using mask_token, but it is not set yet.")
|
||||
return self.convert_tokens_to_ids(self._mask_token)
|
||||
|
||||
@property
|
||||
def additional_special_tokens_ids(self):
|
||||
""" Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """
|
||||
if self._additional_special_tokens is None:
|
||||
logger.error("Using additional_special_tokens, but it is not set yet.")
|
||||
return self.convert_tokens_to_ids(self._additional_special_tokens)
|
||||
|
||||
def __init__(self, max_len=None, **kwargs):
|
||||
self._bos_token = None
|
||||
self._eos_token = None
|
||||
@@ -166,9 +226,15 @@ class PreTrainedTokenizer(object):
|
||||
self._additional_special_tokens = []
|
||||
|
||||
self.max_len = max_len if max_len is not None else int(1e12)
|
||||
|
||||
# Added tokens
|
||||
self.added_tokens_encoder = {}
|
||||
self.added_tokens_decoder = {}
|
||||
|
||||
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
|
||||
self.init_inputs = ()
|
||||
self.init_kwargs = {}
|
||||
|
||||
for key, value in kwargs.items():
|
||||
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
|
||||
if key == 'additional_special_tokens':
|
||||
@@ -193,6 +259,13 @@ class PreTrainedTokenizer(object):
|
||||
cache_dir: (`optional`) string:
|
||||
Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
|
||||
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the vocabulary files and override the cached versions if they exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
|
||||
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
|
||||
|
||||
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
|
||||
@@ -221,15 +294,20 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
|
||||
@classmethod
|
||||
def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
force_download = kwargs.pop('force_download', False)
|
||||
proxies = kwargs.pop('proxies', None)
|
||||
|
||||
s3_models = list(cls.max_model_input_sizes.keys())
|
||||
vocab_files = {}
|
||||
init_configuration = {}
|
||||
if pretrained_model_name_or_path in s3_models:
|
||||
# Get the vocabulary from AWS S3 bucket
|
||||
for file_id, map_list in cls.pretrained_vocab_files_map.items():
|
||||
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
|
||||
if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration:
|
||||
init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path]
|
||||
else:
|
||||
# Get the vocabulary from local files
|
||||
logger.info(
|
||||
@@ -252,15 +330,17 @@ class PreTrainedTokenizer(object):
|
||||
vocab_files[file_id] = full_file_name
|
||||
|
||||
# Look for the additional tokens files
|
||||
all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
|
||||
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
|
||||
additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
|
||||
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE,
|
||||
'tokenizer_config_file': TOKENIZER_CONFIG_FILE,
|
||||
}
|
||||
|
||||
# If a path to a file was provided, get the parent directory
|
||||
saved_directory = pretrained_model_name_or_path
|
||||
if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
|
||||
saved_directory = os.path.dirname(saved_directory)
|
||||
|
||||
for file_id, file_name in all_vocab_files_names.items():
|
||||
for file_id, file_name in additional_files_names.items():
|
||||
full_file_name = os.path.join(saved_directory, file_name)
|
||||
if not os.path.exists(full_file_name):
|
||||
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
|
||||
@@ -283,8 +363,8 @@ class PreTrainedTokenizer(object):
|
||||
if file_path is None:
|
||||
resolved_vocab_files[file_id] = None
|
||||
else:
|
||||
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
except EnvironmentError as e:
|
||||
if pretrained_model_name_or_path in s3_models:
|
||||
logger.error("Couldn't reach server to download vocabulary.")
|
||||
else:
|
||||
@@ -294,7 +374,7 @@ class PreTrainedTokenizer(object):
|
||||
"at this path or url.".format(
|
||||
pretrained_model_name_or_path, ', '.join(s3_models),
|
||||
pretrained_model_name_or_path, str(vocab_files.keys())))
|
||||
return None
|
||||
raise e
|
||||
|
||||
for file_id, file_path in vocab_files.items():
|
||||
if file_path == resolved_vocab_files[file_id]:
|
||||
@@ -303,28 +383,46 @@ class PreTrainedTokenizer(object):
|
||||
logger.info("loading file {} from cache at {}".format(
|
||||
file_path, resolved_vocab_files[file_id]))
|
||||
|
||||
# Prepare tokenizer initialization kwargs
|
||||
# Did we saved some inputs and kwargs to reload ?
|
||||
tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
|
||||
if tokenizer_config_file is not None:
|
||||
init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
|
||||
saved_init_inputs = init_kwargs.pop('init_inputs', ())
|
||||
if not init_inputs:
|
||||
init_inputs = saved_init_inputs
|
||||
else:
|
||||
init_kwargs = init_configuration
|
||||
|
||||
# Update with newly provided kwargs
|
||||
init_kwargs.update(kwargs)
|
||||
|
||||
# Set max length if needed
|
||||
if pretrained_model_name_or_path in cls.max_model_input_sizes:
|
||||
# if we're using a pretrained model, ensure the tokenizer
|
||||
# wont index sequences longer than the number of positional embeddings
|
||||
max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
|
||||
if max_len is not None and isinstance(max_len, (int, float)):
|
||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
||||
init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len)
|
||||
|
||||
# Merge resolved_vocab_files arguments in kwargs.
|
||||
# Merge resolved_vocab_files arguments in init_kwargs.
|
||||
added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
|
||||
special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
|
||||
for args_name, file_path in resolved_vocab_files.items():
|
||||
if args_name not in kwargs:
|
||||
kwargs[args_name] = file_path
|
||||
if args_name not in init_kwargs:
|
||||
init_kwargs[args_name] = file_path
|
||||
if special_tokens_map_file is not None:
|
||||
special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
|
||||
for key, value in special_tokens_map.items():
|
||||
if key not in kwargs:
|
||||
kwargs[key] = value
|
||||
if key not in init_kwargs:
|
||||
init_kwargs[key] = value
|
||||
|
||||
# Instantiate tokenizer.
|
||||
tokenizer = cls(*inputs, **kwargs)
|
||||
tokenizer = cls(*init_inputs, **init_kwargs)
|
||||
|
||||
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
|
||||
tokenizer.init_inputs = init_inputs
|
||||
tokenizer.init_kwargs = init_kwargs
|
||||
|
||||
# Add supplementary tokens.
|
||||
if added_tokens_file is not None:
|
||||
@@ -337,8 +435,13 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
""" Save the tokenizer vocabulary files (with added tokens) and the
|
||||
special-tokens-to-class-attributes-mapping to a directory.
|
||||
""" Save the tokenizer vocabulary files together with:
|
||||
- added tokens,
|
||||
- special-tokens-to-class-attributes-mapping,
|
||||
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
|
||||
|
||||
This won't save modifications other than (added tokens and special token mapping) you may have
|
||||
applied to the tokenizer after the instantion (e.g. modifying tokenizer.do_lower_case after creation).
|
||||
|
||||
This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
|
||||
"""
|
||||
@@ -348,6 +451,15 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
|
||||
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
|
||||
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
|
||||
|
||||
tokenizer_config = copy.deepcopy(self.init_kwargs)
|
||||
tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs)
|
||||
for file_id in self.vocab_files_names.keys():
|
||||
tokenizer_config.pop(file_id, None)
|
||||
|
||||
with open(tokenizer_config_file, 'w', encoding='utf-8') as f:
|
||||
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
|
||||
|
||||
with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
|
||||
f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
|
||||
@@ -429,6 +541,13 @@ class PreTrainedTokenizer(object):
|
||||
to class attributes. If special tokens are NOT in the vocabulary, they are added
|
||||
to it (indexed starting from the last index of the current vocabulary).
|
||||
|
||||
Using `add_special_tokens` will ensure your special tokens can be used in several ways:
|
||||
|
||||
- special tokens are carefully handled by the tokenizer (they are never split)
|
||||
- you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts.
|
||||
|
||||
When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>')
|
||||
|
||||
Args:
|
||||
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
|
||||
[``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
|
||||
@@ -477,15 +596,45 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
Take care of added tokens.
|
||||
"""
|
||||
def split_on_token(tok, text):
|
||||
result = []
|
||||
split_text = text.split(tok)
|
||||
for i, sub_text in enumerate(split_text):
|
||||
sub_text = sub_text.strip()
|
||||
if i == 0 and not sub_text:
|
||||
result += [tok]
|
||||
elif i == len(split_text) - 1:
|
||||
if sub_text:
|
||||
result += [sub_text]
|
||||
else:
|
||||
pass
|
||||
else:
|
||||
if sub_text:
|
||||
result += [sub_text]
|
||||
result += [tok]
|
||||
return result
|
||||
|
||||
def split_on_tokens(tok_list, text):
|
||||
if not text:
|
||||
return []
|
||||
if not tok_list:
|
||||
return self._tokenize(text, **kwargs)
|
||||
tok = tok_list[0]
|
||||
split_text = text.split(tok)
|
||||
return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \
|
||||
for sub_text in split_text), [])[:-1]
|
||||
|
||||
tokenized_text = []
|
||||
text_list = [text]
|
||||
for tok in tok_list:
|
||||
tokenized_text = []
|
||||
for sub_text in text_list:
|
||||
if sub_text not in self.added_tokens_encoder \
|
||||
and sub_text not in self.all_special_tokens:
|
||||
tokenized_text += split_on_token(tok, sub_text)
|
||||
else:
|
||||
tokenized_text += [sub_text]
|
||||
text_list = tokenized_text
|
||||
|
||||
return sum((self._tokenize(token, **kwargs) if token not \
|
||||
in self.added_tokens_encoder and token not in self.all_special_tokens \
|
||||
else [token] for token in tokenized_text), [])
|
||||
|
||||
added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
|
||||
tokenized_text = split_on_tokens(added_tokens, text)
|
||||
@@ -524,7 +673,7 @@ class PreTrainedTokenizer(object):
|
||||
def _convert_token_to_id(self, token):
|
||||
raise NotImplementedError
|
||||
|
||||
def encode(self, text, text_pair=None, add_special_tokens=False):
|
||||
def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs):
|
||||
"""
|
||||
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
|
||||
|
||||
@@ -535,15 +684,16 @@ class PreTrainedTokenizer(object):
|
||||
text_pair: Optional second sequence to be encoded.
|
||||
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
|
||||
to their model.
|
||||
**kwargs: passed to the `self.tokenize()` method
|
||||
"""
|
||||
if text_pair is None:
|
||||
if add_special_tokens:
|
||||
return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
|
||||
return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text, **kwargs)))
|
||||
else:
|
||||
return self.convert_tokens_to_ids(self.tokenize(text))
|
||||
return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
|
||||
|
||||
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
|
||||
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)]
|
||||
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)]
|
||||
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]
|
||||
|
||||
if add_special_tokens:
|
||||
return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
|
||||
@@ -551,10 +701,12 @@ class PreTrainedTokenizer(object):
|
||||
return first_sentence_tokens, second_sentence_tokens
|
||||
|
||||
def add_special_tokens_single_sentence(self, token_ids):
|
||||
raise NotImplementedError
|
||||
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
|
||||
return token_ids
|
||||
|
||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||
raise NotImplementedError
|
||||
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
|
||||
return token_ids_0 + token_ids_1
|
||||
|
||||
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||
""" Converts a single index or a sequence of indices (integers) in a token "
|
||||
@@ -570,7 +722,7 @@ class PreTrainedTokenizer(object):
|
||||
return self._convert_id_to_token(ids)
|
||||
tokens = []
|
||||
for index in ids:
|
||||
if index in self.all_special_ids and skip_special_tokens:
|
||||
if skip_special_tokens and index in self.all_special_ids:
|
||||
continue
|
||||
if index in self.added_tokens_decoder:
|
||||
tokens.append(self.added_tokens_decoder[index])
|
||||
@@ -595,11 +747,29 @@ class PreTrainedTokenizer(object):
|
||||
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
|
||||
"""
|
||||
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
||||
text = self.convert_tokens_to_string(filtered_tokens)
|
||||
|
||||
if self.sep_token is not None and self.sep_token in text:
|
||||
text = text.replace(self.cls_token, self.sep_token)
|
||||
split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token)))
|
||||
# To avoid mixing byte-level and unicode for byte-level BPT
|
||||
# we need to build string separatly for added tokens and byte-level tokens
|
||||
# cf. https://github.com/huggingface/pytorch-transformers/issues/1133
|
||||
sub_texts = []
|
||||
current_sub_text = []
|
||||
for token in filtered_tokens:
|
||||
if skip_special_tokens and token in self.all_special_ids:
|
||||
continue
|
||||
if token in self.added_tokens_encoder:
|
||||
if current_sub_text:
|
||||
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
|
||||
current_sub_text = []
|
||||
sub_texts.append(" " + token)
|
||||
else:
|
||||
current_sub_text.append(token)
|
||||
if current_sub_text:
|
||||
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
|
||||
text = ''.join(sub_texts)
|
||||
|
||||
if self._sep_token is not None and self._sep_token in text:
|
||||
text = text.replace(self._cls_token, self._sep_token)
|
||||
split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
|
||||
if clean_up_tokenization_spaces:
|
||||
clean_text = [self.clean_up_tokenization(text) for text in split_text]
|
||||
return clean_text
|
||||
@@ -632,7 +802,7 @@ class PreTrainedTokenizer(object):
|
||||
all_toks = []
|
||||
set_attr = self.special_tokens_map
|
||||
for attr_value in set_attr.values():
|
||||
all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value])
|
||||
all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
|
||||
all_toks = list(set(all_toks))
|
||||
return all_toks
|
||||
|
||||
|
||||
@@ -20,8 +20,12 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
from io import open
|
||||
|
||||
import sacremoses as sm
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
from .tokenization_bert import BasicTokenizer
|
||||
|
||||
@@ -43,6 +47,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
||||
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
|
||||
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
|
||||
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
|
||||
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json",
|
||||
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json",
|
||||
},
|
||||
'merges_file':
|
||||
{
|
||||
@@ -54,6 +60,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
||||
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
|
||||
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
|
||||
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
|
||||
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt",
|
||||
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -66,6 +74,342 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
'xlm-mlm-xnli15-1024': 512,
|
||||
'xlm-clm-enfr-1024': 512,
|
||||
'xlm-clm-ende-1024': 512,
|
||||
'xlm-mlm-17-1280': 512,
|
||||
'xlm-mlm-100-1280': 512,
|
||||
}
|
||||
|
||||
def _xlm_lang_config(languages, do_lowercase_and_remove_accent=True):
    """Build one pretrained init configuration from an ordered list of language codes.

    The position of a code in ``languages`` is its language id: ``id2lang`` maps the
    id (as a string) to the code and ``lang2id`` maps the code back to the integer id.
    Fresh dicts are created on every call so configurations never share state.
    """
    return {
        "do_lowercase_and_remove_accent": do_lowercase_and_remove_accent,
        "id2lang": {str(index): code for index, code in enumerate(languages)},
        "lang2id": {code: index for index, code in enumerate(languages)},
    }


# Language inventories, listed in language-id order.
_XLM_XNLI15_LANGUAGES = (
    "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw",
    "th", "tr", "ur", "vi", "zh",
)

_XLM_17_LANGUAGES = (
    "ar", "de", "en", "es", "fr", "hi", "it", "ja", "ko", "nl",
    "pl", "pt", "ru", "sv", "tr", "vi", "zh",
)

_XLM_100_LANGUAGES = (
    "af", "als", "am", "an", "ang", "ar", "arz", "ast", "az", "bar",
    "be", "bg", "bn", "br", "bs", "ca", "ceb", "ckb", "cs", "cy",
    "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi",
    "fr", "fy", "ga", "gan", "gl", "gu", "he", "hi", "hr", "hu",
    "hy", "ia", "id", "is", "it", "ja", "jv", "ka", "kk", "kn",
    "ko", "ku", "la", "lb", "lt", "lv", "mk", "ml", "mn", "mr",
    "ms", "my", "nds", "ne", "nl", "nn", "no", "oc", "pl", "pt",
    "ro", "ru", "scn", "sco", "sh", "si", "simple", "sk", "sl", "sq",
    "sr", "sv", "sw", "ta", "te", "th", "tl", "tr", "tt", "uk",
    "ur", "uz", "vi", "war", "wuu", "yi", "zh", "zh_classical", "zh_min_nan", "zh_yue",
)

# Per-checkpoint keyword arguments for the tokenizer constructor.
PRETRAINED_INIT_CONFIGURATION = {
    'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True},
    'xlm-mlm-ende-1024': _xlm_lang_config(("de", "en")),
    'xlm-mlm-enfr-1024': _xlm_lang_config(("en", "fr")),
    'xlm-mlm-enro-1024': _xlm_lang_config(("en", "ro")),
    'xlm-mlm-tlm-xnli15-1024': _xlm_lang_config(_XLM_XNLI15_LANGUAGES),
    'xlm-mlm-xnli15-1024': _xlm_lang_config(_XLM_XNLI15_LANGUAGES),
    'xlm-clm-enfr-1024': _xlm_lang_config(("en", "fr")),
    'xlm-clm-ende-1024': _xlm_lang_config(("de", "en")),
    'xlm-mlm-17-1280': _xlm_lang_config(_XLM_17_LANGUAGES, do_lowercase_and_remove_accent=False),
    'xlm-mlm-100-1280': _xlm_lang_config(_XLM_100_LANGUAGES, do_lowercase_and_remove_accent=False),
}
|
||||
|
||||
def get_pairs(word):
|
||||
@@ -80,57 +424,145 @@ def get_pairs(word):
|
||||
prev_char = char
|
||||
return pairs
|
||||
|
||||
def text_standardize(text):
|
||||
|
||||
def lowercase_and_remove_accent(text):
    """
    Lowercase and strip accents from a sequence of tokens, based on
    https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py

    Args:
        text: iterable of token strings (it is joined with single spaces first).

    Returns:
        List of lower-cased tokens with combining marks removed, obtained by
        splitting the processed string on single spaces. An empty input yields [''].
    """
    text = ' '.join(text)
    text = text.lower()
    # NFD splits an accented character into its base character followed by
    # combining marks (category "Mn"), which are then dropped.
    text = unicodedata.normalize("NFD", text)
    output = "".join(char for char in text if unicodedata.category(char) != "Mn")
    return output.lower().split(' ')
|
||||
|
||||
|
||||
def replace_unicode_punct(text):
    '''
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl

    Replaces full-width / CJK punctuation and full-width digits by ASCII
    equivalents. Every non-ASCII source character is written as an explicit
    Unicode escape so that width normalisation of this file cannot silently turn
    a mapping into an ASCII-to-ASCII replacement (which would, for example, make
    the full-stop rule match every character).

    Args:
        text: the string to normalise.

    Returns:
        The string with the replacements applied; ASCII input is returned unchanged.
    '''
    text = text.replace(u'\uff0c', ',')        # FULLWIDTH COMMA
    text = re.sub(u'\u3002\\s*', '. ', text)   # IDEOGRAPHIC FULL STOP and following whitespace
    text = text.replace(u'\u3001', ',')        # IDEOGRAPHIC COMMA
    text = text.replace(u'\u201d', '"')        # RIGHT DOUBLE QUOTATION MARK
    text = text.replace(u'\u201c', '"')        # LEFT DOUBLE QUOTATION MARK
    text = text.replace(u'\u2236', ':')        # RATIO
    text = text.replace(u'\uff1a', ':')        # FULLWIDTH COLON
    text = text.replace(u'\uff1f', '?')        # FULLWIDTH QUESTION MARK
    text = text.replace(u'\u300a', '"')        # LEFT DOUBLE ANGLE BRACKET
    text = text.replace(u'\u300b', '"')        # RIGHT DOUBLE ANGLE BRACKET
    text = text.replace(u'\uff09', ')')        # FULLWIDTH RIGHT PARENTHESIS
    text = text.replace(u'\uff01', '!')        # FULLWIDTH EXCLAMATION MARK
    text = text.replace(u'\uff08', '(')        # FULLWIDTH LEFT PARENTHESIS
    text = text.replace(u'\uff1b', ';')        # FULLWIDTH SEMICOLON
    # NOTE(review): the ported mapping sends FULLWIDTH DIGIT ONE to a double quote,
    # not to '1'; kept as-is to stay faithful to the port - confirm against the Perl script.
    text = text.replace(u'\uff11', '"')        # FULLWIDTH DIGIT ONE
    text = text.replace(u'\u300d', '"')        # RIGHT CORNER BRACKET
    text = text.replace(u'\u300c', '"')        # LEFT CORNER BRACKET
    text = text.replace(u'\uff10', '0')        # FULLWIDTH DIGIT ZERO
    text = text.replace(u'\uff13', '3')        # FULLWIDTH DIGIT THREE
    text = text.replace(u'\uff12', '2')        # FULLWIDTH DIGIT TWO
    text = text.replace(u'\uff15', '5')        # FULLWIDTH DIGIT FIVE
    text = text.replace(u'\uff16', '6')        # FULLWIDTH DIGIT SIX
    text = text.replace(u'\uff19', '9')        # FULLWIDTH DIGIT NINE
    text = text.replace(u'\uff17', '7')        # FULLWIDTH DIGIT SEVEN
    text = text.replace(u'\uff18', '8')        # FULLWIDTH DIGIT EIGHT
    text = text.replace(u'\uff14', '4')        # FULLWIDTH DIGIT FOUR
    text = re.sub(u'\uff0e\\s*', '. ', text)   # FULLWIDTH FULL STOP and following whitespace
    text = text.replace(u'\uff5e', '~')        # FULLWIDTH TILDE
    text = text.replace(u'\u2019', '\'')       # RIGHT SINGLE QUOTATION MARK
    text = text.replace(u'\u2026', '...')      # HORIZONTAL ELLIPSIS
    text = text.replace(u'\u2501', '-')        # BOX DRAWINGS HEAVY HORIZONTAL
    text = text.replace(u'\u3008', '<')        # LEFT ANGLE BRACKET
    text = text.replace(u'\u3009', '>')        # RIGHT ANGLE BRACKET
    text = text.replace(u'\u3010', '[')        # LEFT BLACK LENTICULAR BRACKET
    text = text.replace(u'\u3011', ']')        # RIGHT BLACK LENTICULAR BRACKET
    text = text.replace(u'\uff05', '%')        # FULLWIDTH PERCENT SIGN
    return text
|
||||
|
||||
|
||||
def remove_non_printing_char(text):
    '''
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl

    Drops every character whose Unicode general category starts with 'C'
    and returns the remaining characters as a string.
    '''
    return "".join(
        symbol for symbol in text
        if not unicodedata.category(symbol).startswith('C')
    )
|
||||
|
||||
|
||||
def romanian_preprocessing(text):
    '''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`'''
    replacements = (
        # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py
        ("\u015e", "\u0218"), ("\u015f", "\u0219"),
        ("\u0162", "\u021a"), ("\u0163", "\u021b"),
        # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py
        ("\u0218", "S"), ("\u0219", "s"),  # s-comma
        ("\u021a", "T"), ("\u021b", "t"),  # t-comma
        ("\u0102", "A"), ("\u0103", "a"),
        ("\u00C2", "A"), ("\u00E2", "a"),
        ("\u00CE", "I"), ("\u00EE", "i"),
    )
    # Applied strictly in order: the cedilla forms are first mapped to the comma
    # forms, which the later pairs then reduce to plain ASCII letters.
    for source, target in replacements:
        text = text.replace(source, target)
    return text
|
||||
|
||||
|
||||
class XLMTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities:
|
||||
BPE tokenizer for XLM
|
||||
|
||||
- lower case all inputs
|
||||
- Moses preprocessing & tokenization for most supported languages
|
||||
|
||||
- uses `SpaCy tokenizer <https://spacy.io/api/tokenizer/>`_ and \
|
||||
`ftfy <https://ftfy.readthedocs.io/en/latest/>`_ for pre-BPE tokenization if they are installed, \
|
||||
fallback to BERT's BasicTokenizer if not.
|
||||
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
|
||||
|
||||
- (optionally) lower case & normalize all inputs text
|
||||
|
||||
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
|
||||
(ex: "__classify__") to a vocabulary.
|
||||
(ex: "__classify__") to a vocabulary
|
||||
|
||||
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
|
||||
|
||||
- `id2lang` attribute does the reverse mapping if provided (automatically set for pretrained vocabularies)
|
||||
|
||||
- `do_lowercase_and_remove_accent` controls lowercasing and accent removal (automatically set for pretrained vocabularies)
|
||||
"""
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
|
||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
|
||||
sep_token="</s>", pad_token="<pad>", cls_token="</s>",
|
||||
mask_token="<special1>", additional_special_tokens=["<special0>",
|
||||
"<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
|
||||
"<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
|
||||
"<special6>", "<special7>", "<special8>", "<special9>"],
|
||||
lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True,
|
||||
**kwargs):
|
||||
super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
|
||||
sep_token=sep_token, pad_token=pad_token,
|
||||
cls_token=cls_token, mask_token=mask_token,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
**kwargs)
|
||||
try:
|
||||
import ftfy
|
||||
import spacy
|
||||
self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
|
||||
self.fix_text = ftfy.fix_text
|
||||
except ImportError:
|
||||
logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
|
||||
self.nlp = BasicTokenizer(do_lower_case=True)
|
||||
self.fix_text = None
|
||||
|
||||
# cache of sm.MosesPunctNormalizer instance
|
||||
self.cache_moses_punct_normalizer = dict()
|
||||
# cache of sm.MosesTokenizer instance
|
||||
self.cache_moses_tokenizer = dict()
|
||||
self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja'])
|
||||
# True for current supported model (v1.2.0), False for XLM-17 & 100
|
||||
self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
|
||||
self.lang2id = lang2id
|
||||
self.id2lang = id2lang
|
||||
if lang2id is not None and id2lang is not None:
|
||||
assert len(lang2id) == len(id2lang)
|
||||
|
||||
self.ja_word_tokenizer = None
|
||||
self.zh_word_tokenizer = None
|
||||
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
@@ -139,6 +571,43 @@ class XLMTokenizer(PreTrainedTokenizer):
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
|
||||
def moses_punct_norm(self, text, lang):
    """Normalize punctuation in `text` with a per-language, cached `sm.MosesPunctNormalizer`."""
    normalizers = self.cache_moses_punct_normalizer
    if lang in normalizers:
        return normalizers[lang].normalize(text)
    # First request for this language: build the normalizer and remember it.
    normalizer = sm.MosesPunctNormalizer(lang=lang)
    normalizers[lang] = normalizer
    return normalizer.normalize(text)
|
||||
|
||||
def moses_tokenize(self, text, lang):
    """Tokenize `text` with a per-language, cached `sm.MosesTokenizer` (no escaping, list output)."""
    tokenizers = self.cache_moses_tokenizer
    if lang in tokenizers:
        return tokenizers[lang].tokenize(text, return_str=False, escape=False)
    # First request for this language: build the tokenizer and remember it.
    tokenizer = sm.MosesTokenizer(lang=lang)
    tokenizers[lang] = tokenizer
    return tokenizer.tokenize(text, return_str=False, escape=False)
|
||||
|
||||
def moses_pipeline(self, text, lang):
    """Run the pre-tokenization clean-up chain on `text`.

    Order: `replace_unicode_punct`, then `self.moses_punct_norm` for `lang`,
    then `remove_non_printing_char`. Returns the cleaned string.
    """
    normalized = self.moses_punct_norm(replace_unicode_punct(text), lang)
    return remove_non_printing_char(normalized)
|
||||
|
||||
def ja_tokenize(self, text):
    """Segment Japanese `text` into a list of words with KyTea.

    The Mykytea tokenizer is created on first use and stored in
    `self.ja_word_tokenizer`. If it cannot be imported or built, installation
    instructions are logged and the original exception is re-raised.
    """
    if self.ja_word_tokenizer is not None:
        return list(self.ja_word_tokenizer.getWS(text))

    try:
        import Mykytea
        self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
    except (AttributeError, ImportError) as e:
        install_help = (
            "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps",
            "1. git clone git@github.com:neubig/kytea.git && cd kytea",
            "2. autoreconf -i",
            "3. ./configure --prefix=$HOME/local",
            "4. make && make install",
            "5. pip install kytea",
        )
        for message in install_help:
            logger.error(message)
        raise e
    return list(self.ja_word_tokenizer.getWS(text))
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return len(self.encoder)
|
||||
@@ -186,19 +655,90 @@ class XLMTokenizer(PreTrainedTokenizer):
|
||||
self.cache[token] = word
|
||||
return word
|
||||
|
||||
def _tokenize(self, text):
|
||||
""" Tokenize a string. """
|
||||
split_tokens = []
|
||||
if self.fix_text is None:
|
||||
# Using BERT's BasicTokenizer
|
||||
text = self.nlp.tokenize(text)
|
||||
for token in text:
|
||||
split_tokens.extend([t for t in self.bpe(token).split(' ')])
|
||||
def _tokenize(self, text, lang='en', bypass_tokenizer=False):
|
||||
"""
|
||||
Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizerself. Otherwise, we use Moses.
|
||||
|
||||
Details of tokenization:
|
||||
- [sacremoses](https://github.com/alvations/sacremoses): port of Moses
|
||||
- Install with `pip install sacremoses`
|
||||
- [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer
|
||||
- Install with `pip install pythainlp`
|
||||
- [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea)
|
||||
- Install with the following steps:
|
||||
```
|
||||
git clone git@github.com:neubig/kytea.git && cd kytea
|
||||
autoreconf -i
|
||||
./configure --prefix=$HOME/local
|
||||
make && make install
|
||||
pip install kytea
|
||||
```
|
||||
- [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer *
|
||||
- Install with `pip install jieba`
|
||||
|
||||
\* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip).
|
||||
However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated.
|
||||
Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine
|
||||
if you fine-tune the model with Chinese supervisionself. If you want the same exact behaviour, use the original XLM
|
||||
[preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally,
|
||||
and set `bypass_tokenizer=True` to bypass the tokenizer.
|
||||
|
||||
Args:
|
||||
- lang: ISO language code (default = 'en') (string). Languages should belong of the model supported languages. However, we don't enforce it.
|
||||
- bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE.
|
||||
|
||||
Returns:
|
||||
List of tokens.
|
||||
"""
|
||||
if lang and self.lang2id and lang not in self.lang2id:
|
||||
logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.")
|
||||
if bypass_tokenizer:
|
||||
text = text.split()
|
||||
elif lang not in self.lang_with_custom_tokenizer:
|
||||
text = self.moses_pipeline(text, lang=lang)
|
||||
# TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
|
||||
if lang == 'ro':
|
||||
text = romanian_preprocessing(text)
|
||||
text = self.moses_tokenize(text, lang=lang)
|
||||
elif lang == 'th':
|
||||
text = self.moses_pipeline(text, lang=lang)
|
||||
try:
|
||||
if 'pythainlp' not in sys.modules:
|
||||
from pythainlp.tokenize import word_tokenize as th_word_tokenize
|
||||
else:
|
||||
th_word_tokenize = sys.modules['pythainlp'].word_tokenize
|
||||
except (AttributeError, ImportError) as e:
|
||||
logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps")
|
||||
logger.error("1. pip install pythainlp")
|
||||
raise e
|
||||
text = th_word_tokenize(text)
|
||||
elif lang == 'zh':
|
||||
try:
|
||||
if 'jieba' not in sys.modules:
|
||||
import jieba
|
||||
else:
|
||||
jieba = sys.modules['jieba']
|
||||
except (AttributeError, ImportError) as e:
|
||||
logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
|
||||
logger.error("1. pip install jieba")
|
||||
raise e
|
||||
text = ' '.join(jieba.cut(text))
|
||||
text = self.moses_pipeline(text, lang=lang)
|
||||
text = text.split()
|
||||
elif lang == 'ja':
|
||||
text = self.moses_pipeline(text, lang=lang)
|
||||
text = self.ja_tokenize(text)
|
||||
else:
|
||||
# Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
|
||||
text = self.nlp(text_standardize(self.fix_text(text)))
|
||||
for token in text:
|
||||
split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
|
||||
raise ValueError('It should not reach here')
|
||||
|
||||
if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
|
||||
text = lowercase_and_remove_accent(text)
|
||||
|
||||
split_tokens = []
|
||||
for token in text:
|
||||
if token:
|
||||
split_tokens.extend([t for t in self.bpe(token).split(' ')])
|
||||
|
||||
return split_tokens
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
@@ -219,15 +759,15 @@ class XLMTokenizer(PreTrainedTokenizer):
|
||||
Adds special tokens to a sequence for sequence classification tasks.
|
||||
An XLM sequence has the following format: [CLS] X [SEP]
|
||||
"""
|
||||
return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
|
||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
||||
|
||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    """
    Adds special tokens to a sequence pair for sequence classification tasks.
    An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]

    Args:
        token_ids_0: list of token ids of the first sequence (A).
        token_ids_1: list of token ids of the second sequence (B).

    Returns:
        A new list: the cls id, A, the sep id, B, the sep id.
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]
    return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||
|
||||
def save_vocabulary(self, save_directory):
|
||||
|
||||
@@ -61,7 +61,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
|
||||
def __init__(self, vocab_file, max_len=None,
|
||||
def __init__(self, vocab_file,
|
||||
do_lower_case=False, remove_space=True, keep_accents=False,
|
||||
bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
|
||||
pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
|
||||
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
pad_token=pad_token, cls_token=cls_token,
|
||||
mask_token=mask_token, additional_special_tokens=
|
||||
additional_special_tokens, **kwargs)
|
||||
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
||||
|
||||
try:
|
||||
import sentencepiece as spm
|
||||
except ImportError:
|
||||
@@ -182,8 +186,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
Adds special tokens to a sequence for sequence classification tasks.
|
||||
An XLNet sequence has the following format: X [SEP][CLS]
|
||||
"""
|
||||
sep = [self._convert_token_to_id(self.sep_token)]
|
||||
cls = [self._convert_token_to_id(self.cls_token)]
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
return token_ids + sep + cls
|
||||
|
||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||
@@ -191,8 +195,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
||||
An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
|
||||
"""
|
||||
sep = [self._convert_token_to_id(self.sep_token)]
|
||||
cls = [self._convert_token_to_id(self.cls_token)]
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||
|
||||
def save_vocabulary(self, save_directory):
|
||||
|
||||
@@ -9,4 +9,6 @@ requests
|
||||
# For OpenAI GPT
|
||||
regex
|
||||
# For XLNet
|
||||
sentencepiece
|
||||
sentencepiece
|
||||
# For XLM
|
||||
sacremoses
|
||||
5
setup.py
5
setup.py
@@ -38,7 +38,7 @@ from setuptools import find_packages, setup
|
||||
|
||||
setup(
|
||||
name="pytorch_transformers",
|
||||
version="1.1.0",
|
||||
version="1.2.0",
|
||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
|
||||
author_email="thomas@huggingface.co",
|
||||
description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
|
||||
@@ -55,7 +55,8 @@ setup(
|
||||
'requests',
|
||||
'tqdm',
|
||||
'regex',
|
||||
'sentencepiece'],
|
||||
'sentencepiece',
|
||||
'sacremoses'],
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
"pytorch_transformers=pytorch_transformers.__main__:main",
|
||||
|
||||
Reference in New Issue
Block a user