Merge remote-tracking branch 'huggingface/master'
# Conflicts: # pytorch_transformers/__init__.py
This commit is contained in:
@@ -4,8 +4,8 @@ jobs:
|
|||||||
working_directory: ~/pytorch-transformers
|
working_directory: ~/pytorch-transformers
|
||||||
docker:
|
docker:
|
||||||
- image: circleci/python:3.5
|
- image: circleci/python:3.5
|
||||||
resource_class: large
|
resource_class: xlarge
|
||||||
parallelism: 4
|
parallelism: 1
|
||||||
steps:
|
steps:
|
||||||
- checkout
|
- checkout
|
||||||
- run: sudo pip install --progress-bar off .
|
- run: sudo pip install --progress-bar off .
|
||||||
@@ -17,7 +17,7 @@ jobs:
|
|||||||
build_py2:
|
build_py2:
|
||||||
working_directory: ~/pytorch-transformers
|
working_directory: ~/pytorch-transformers
|
||||||
resource_class: large
|
resource_class: large
|
||||||
parallelism: 4
|
parallelism: 1
|
||||||
docker:
|
docker:
|
||||||
- image: circleci/python:2.7
|
- image: circleci/python:2.7
|
||||||
steps:
|
steps:
|
||||||
@@ -26,9 +26,28 @@ jobs:
|
|||||||
- run: sudo pip install pytest codecov pytest-cov
|
- run: sudo pip install pytest codecov pytest-cov
|
||||||
- run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
|
- run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
|
||||||
- run: codecov
|
- run: codecov
|
||||||
|
deploy_doc:
|
||||||
|
working_directory: ~/pytorch-transformers
|
||||||
|
docker:
|
||||||
|
- image: circleci/python:3.5
|
||||||
|
steps:
|
||||||
|
- add_ssh_keys:
|
||||||
|
fingerprints:
|
||||||
|
- "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
|
||||||
|
- checkout
|
||||||
|
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
||||||
|
- run: sudo pip install --progress-bar off -r requirements.txt
|
||||||
|
- run: cd docs/source && ln -s ../../examples/README.md examples.md && cd -
|
||||||
|
- run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
|
||||||
|
workflow_filters: &workflow_filters
|
||||||
|
filters:
|
||||||
|
branches:
|
||||||
|
only:
|
||||||
|
- master
|
||||||
workflows:
|
workflows:
|
||||||
version: 2
|
version: 2
|
||||||
build_and_test:
|
build_and_test:
|
||||||
jobs:
|
jobs:
|
||||||
- build_py3
|
- build_py3
|
||||||
- build_py2
|
- build_py2
|
||||||
|
- deploy_doc: *workflow_filters
|
||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -130,4 +130,5 @@ runs
|
|||||||
examples/runs
|
examples/runs
|
||||||
|
|
||||||
# data
|
# data
|
||||||
data
|
data
|
||||||
|
serialization_dir
|
||||||
11
README.md
11
README.md
@@ -21,6 +21,7 @@ These implementations have been tested on several datasets (see the example scri
|
|||||||
| Section | Description |
|
| Section | Description |
|
||||||
|-|-|
|
|-|-|
|
||||||
| [Installation](#installation) | How to install the package |
|
| [Installation](#installation) | How to install the package |
|
||||||
|
| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
|
||||||
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
||||||
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
||||||
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
|
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
|
||||||
@@ -68,6 +69,14 @@ It contains an example of a conversion script from a Pytorch trained Transformer
|
|||||||
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
|
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
|
||||||
or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
|
or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
|
||||||
|
|
||||||
|
## Online demo
|
||||||
|
|
||||||
|
**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
|
||||||
|
You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`.
|
||||||
|
|
||||||
|
> “🦄 Write with transformer is to writing what calculators are to calculus.”
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
## Quick tour
|
## Quick tour
|
||||||
|
|
||||||
@@ -95,7 +104,7 @@ for model_class, tokenizer_class, pretrained_weights in MODELS:
|
|||||||
model = model_class.from_pretrained(pretrained_weights)
|
model = model_class.from_pretrained(pretrained_weights)
|
||||||
|
|
||||||
# Encode text
|
# Encode text
|
||||||
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
|
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)]) # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
last_hidden_states = model(input_ids)[0] # Models outputs are now tuples
|
last_hidden_states = model(input_ids)[0] # Models outputs are now tuples
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,13 @@ pip install recommonmark
|
|||||||
|
|
||||||
## Building the documentation
|
## Building the documentation
|
||||||
|
|
||||||
|
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
|
||||||
|
command to generate it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ln -s ../../examples/README.md source/examples.md
|
||||||
|
```
|
||||||
|
|
||||||
Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
|
Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -26,3 +26,4 @@ sphinxcontrib-jsmath==1.0.1
|
|||||||
sphinxcontrib-qthelp==1.0.2
|
sphinxcontrib-qthelp==1.0.2
|
||||||
sphinxcontrib-serializinghtml==1.1.3
|
sphinxcontrib-serializinghtml==1.1.3
|
||||||
urllib3==1.25.3
|
urllib3==1.25.3
|
||||||
|
sphinx-markdown-tables==0.0.9
|
||||||
@@ -26,7 +26,7 @@ author = u'huggingface'
|
|||||||
# The short X.Y version
|
# The short X.Y version
|
||||||
version = u''
|
version = u''
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = u'1.0.0'
|
release = u'1.2.0'
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
@@ -43,7 +43,8 @@ extensions = [
|
|||||||
'sphinx.ext.coverage',
|
'sphinx.ext.coverage',
|
||||||
'sphinx.ext.napoleon',
|
'sphinx.ext.napoleon',
|
||||||
'recommonmark',
|
'recommonmark',
|
||||||
'sphinx.ext.viewcode'
|
'sphinx.ext.viewcode',
|
||||||
|
'sphinx_markdown_tables'
|
||||||
]
|
]
|
||||||
|
|
||||||
# Add any paths that contain templates here, relative to this directory.
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
|
|||||||
@@ -1,682 +0,0 @@
|
|||||||
examples.rst
|
|
||||||
|
|
||||||
Examples
|
|
||||||
================================================
|
|
||||||
|
|
||||||
.. list-table::
|
|
||||||
:header-rows: 1
|
|
||||||
|
|
||||||
* - Sub-section
|
|
||||||
- Description
|
|
||||||
* - `Training large models: introduction, tools and examples <#introduction>`_
|
|
||||||
- How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models
|
|
||||||
* - `Fine-tuning with BERT: running the examples <#fine-tuning-bert-examples>`_
|
|
||||||
- Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py``
|
|
||||||
* - `Fine-tuning with OpenAI GPT, Transformer-XL, GPT-2 as well as BERT and RoBERTa <#fine-tuning>`_
|
|
||||||
- Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py``, ``run_gpt2.py`` and ``run_lm_finetuning.py``
|
|
||||||
* - `Fine-tuning BERT-large on GPUs <#fine-tuning-bert-large>`_
|
|
||||||
- How to fine tune ``BERT large``
|
|
||||||
|
|
||||||
|
|
||||||
.. _introduction:
|
|
||||||
|
|
||||||
Training large models: introduction, tools and examples
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
BERT-base and BERT-large are 110M- and 340M-parameter models, respectively, and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most cases a batch size of 32).
|
|
||||||
|
|
||||||
To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ : gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read `the tips on training large batches in PyTorch <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_ that I published earlier this year.
|
|
||||||
|
|
||||||
Here is how to use these techniques in our scripts:
|
|
||||||
|
|
||||||
|
|
||||||
* **Gradient Accumulation**\ : Gradient accumulation can be used by supplying an integer greater than 1 to the ``--gradient_accumulation_steps`` argument. The batch at each step will be divided by this integer and gradients will be accumulated over ``gradient_accumulation_steps`` steps.
|
|
||||||
* **Multi-GPU**\ : Multi-GPU is automatically activated when several GPUs are detected and the batches are split across the GPUs.
|
|
||||||
* **Distributed training**\ : Distributed training can be activated by supplying an integer greater or equal to 0 to the ``--local_rank`` argument (see below).
|
|
||||||
* **16-bits training**\ : 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found `here <https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/>`__ and a full documentation is `here <https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`__. In our scripts, this option can be activated by setting the ``--fp16`` flag and you can play with loss scaling using the ``--loss_scale`` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
|
|
||||||
|
|
||||||
To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here <https://github.com/nvidia/apex>`__. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository <https://github.com/nvidia/apex>`_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository <https://github.com/huggingface/pytorch-pretrained-BERT/pull/116>`_.
|
|
||||||
|
|
||||||
Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see `the above mentioned blog post <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_ for more details):
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python -m torch.distributed.launch \
|
|
||||||
--nproc_per_node=4 \
|
|
||||||
--nnodes=2 \
|
|
||||||
--node_rank=$THIS_MACHINE_INDEX \
|
|
||||||
--master_addr="192.168.1.1" \
|
|
||||||
--master_port=1234 run_bert_classifier.py \
|
|
||||||
(--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
|
|
||||||
|
|
||||||
Where ``$THIS_MACHINE_INDEX`` is a sequential index assigned to each of your machines (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``.
|
|
||||||
|
|
||||||
.. _fine-tuning-bert-examples:
|
|
||||||
|
|
||||||
Fine-tuning with BERT: running the examples
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
We showcase several fine-tuning examples based on (and extended from) `the original implementation <https://github.com/google-research/bert/>`_\ :
|
|
||||||
|
|
||||||
|
|
||||||
* a *sequence-level classifier* on nine different GLUE tasks,
|
|
||||||
* a *token-level classifier* on the question answering dataset SQuAD, and
|
|
||||||
* a *sequence-level multiple-choice classifier* on the SWAG classification corpus.
|
|
||||||
* a *BERT language model* on another target corpus
|
|
||||||
|
|
||||||
GLUE results on dev set
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
We get the following results on the dev set of GLUE benchmark with an uncased BERT base
|
|
||||||
model (`bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train batch size of 24. Some of
|
|
||||||
these tasks have a small dataset and training can lead to high variance in the results between different runs.
|
|
||||||
We report the median on 5 runs (with different seeds) for each of the metrics.
|
|
||||||
|
|
||||||
.. list-table::
|
|
||||||
:header-rows: 1
|
|
||||||
|
|
||||||
* - Task
|
|
||||||
- Metric
|
|
||||||
- Result
|
|
||||||
* - CoLA
|
|
||||||
- Matthew's corr.
|
|
||||||
- 55.75
|
|
||||||
* - SST-2
|
|
||||||
- accuracy
|
|
||||||
- 92.09
|
|
||||||
* - MRPC
|
|
||||||
- F1/accuracy
|
|
||||||
- 90.48/86.27
|
|
||||||
* - STS-B
|
|
||||||
- Pearson/Spearman corr.
|
|
||||||
- 89.03/88.64
|
|
||||||
* - QQP
|
|
||||||
- accuracy/F1
|
|
||||||
- 90.92/87.72
|
|
||||||
* - MNLI
|
|
||||||
- matched acc./mismatched acc.
|
|
||||||
- 83.74/84.06
|
|
||||||
* - QNLI
|
|
||||||
- accuracy
|
|
||||||
- 91.07
|
|
||||||
* - RTE
|
|
||||||
- accuracy
|
|
||||||
- 68.59
|
|
||||||
* - WNLI
|
|
||||||
- accuracy
|
|
||||||
- 43.66
|
|
||||||
|
|
||||||
|
|
||||||
Some of these results are significantly different from the ones reported on the test set
|
|
||||||
of GLUE benchmark on the website. For QQP and WNLI, please refer to `FAQ #12 <https://gluebenchmark.com/faq>`_ on the website.
|
|
||||||
|
|
||||||
Before running any one of these GLUE tasks you should download the
|
|
||||||
`GLUE data <https://gluebenchmark.com/tasks>`_ by running
|
|
||||||
`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
|
|
||||||
and unpack it to some directory ``$GLUE_DIR``.
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
export GLUE_DIR=/path/to/glue
|
|
||||||
export TASK_NAME=MRPC
|
|
||||||
|
|
||||||
python run_bert_classifier.py \
|
|
||||||
--task_name $TASK_NAME \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--do_lower_case \
|
|
||||||
--data_dir $GLUE_DIR/$TASK_NAME \
|
|
||||||
--bert_model bert-base-uncased \
|
|
||||||
--max_seq_length 128 \
|
|
||||||
--train_batch_size 32 \
|
|
||||||
--learning_rate 2e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--output_dir /tmp/$TASK_NAME/
|
|
||||||
|
|
||||||
where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
|
|
||||||
|
|
||||||
The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
|
|
||||||
|
|
||||||
The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well, since the data processor for each task inherits from the base class DataProcessor.
|
|
||||||
|
|
||||||
MRPC
|
|
||||||
~~~~
|
|
||||||
|
|
||||||
This example code fine-tunes BERT on the Microsoft Research Paraphrase
|
|
||||||
Corpus (MRPC) and runs in less than 10 minutes on a single K-80 and in 27 seconds (!) on a single Tesla V100 16GB with apex installed.
|
|
||||||
|
|
||||||
Before running this example you should download the
|
|
||||||
`GLUE data <https://gluebenchmark.com/tasks>`_ by running
|
|
||||||
`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
|
|
||||||
and unpack it to some directory ``$GLUE_DIR``.
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
export GLUE_DIR=/path/to/glue
|
|
||||||
|
|
||||||
python run_bert_classifier.py \
|
|
||||||
--task_name MRPC \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--do_lower_case \
|
|
||||||
--data_dir $GLUE_DIR/MRPC/ \
|
|
||||||
--bert_model bert-base-uncased \
|
|
||||||
--max_seq_length 128 \
|
|
||||||
--train_batch_size 32 \
|
|
||||||
--learning_rate 2e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--output_dir /tmp/mrpc_output/
|
|
||||||
|
|
||||||
Our tests, run on a few seeds with `the original implementation hyper-parameters <https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks>`__, gave evaluation results between 84% and 88%.
|
|
||||||
|
|
||||||
**Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!**
|
|
||||||
First install apex as indicated `here <https://github.com/NVIDIA/apex>`__.
|
|
||||||
Then run
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
export GLUE_DIR=/path/to/glue
|
|
||||||
|
|
||||||
python run_bert_classifier.py \
|
|
||||||
--task_name MRPC \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--do_lower_case \
|
|
||||||
--data_dir $GLUE_DIR/MRPC/ \
|
|
||||||
--bert_model bert-base-uncased \
|
|
||||||
--max_seq_length 128 \
|
|
||||||
--train_batch_size 32 \
|
|
||||||
--learning_rate 2e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--output_dir /tmp/mrpc_output/ \
|
|
||||||
--fp16
|
|
||||||
|
|
||||||
**Distributed training**
|
|
||||||
Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python -m torch.distributed.launch \
|
|
||||||
--nproc_per_node 8 run_bert_classifier.py \
|
|
||||||
--bert_model bert-large-uncased-whole-word-masking \
|
|
||||||
--task_name MRPC \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--do_lower_case \
|
|
||||||
--data_dir $GLUE_DIR/MRPC/ \
|
|
||||||
--max_seq_length 128 \
|
|
||||||
--train_batch_size 8 \
|
|
||||||
--learning_rate 2e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--output_dir /tmp/mrpc_output/
|
|
||||||
|
|
||||||
Training with these hyper-parameters gave us the following results:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
acc = 0.8823529411764706
|
|
||||||
acc_and_f1 = 0.901702786377709
|
|
||||||
eval_loss = 0.3418912578906332
|
|
||||||
f1 = 0.9210526315789473
|
|
||||||
global_step = 174
|
|
||||||
loss = 0.07231863956341798
|
|
||||||
|
|
||||||
Here is an example on MNLI:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python -m torch.distributed.launch \
|
|
||||||
--nproc_per_node 8 run_bert_classifier.py \
|
|
||||||
--bert_model bert-large-uncased-whole-word-masking \
|
|
||||||
--task_name mnli \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--do_lower_case \
|
|
||||||
--data_dir /datadrive/bert_data/glue_data//MNLI/ \
|
|
||||||
--max_seq_length 128 \
|
|
||||||
--train_batch_size 8 \
|
|
||||||
--learning_rate 2e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--output_dir ../models/wwm-uncased-finetuned-mnli/ \
|
|
||||||
--overwrite_output_dir
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
***** Eval results *****
|
|
||||||
acc = 0.8679706601466992
|
|
||||||
eval_loss = 0.4911287787382479
|
|
||||||
global_step = 18408
|
|
||||||
loss = 0.04755385363816904
|
|
||||||
|
|
||||||
***** Eval results *****
|
|
||||||
acc = 0.8747965825874695
|
|
||||||
eval_loss = 0.45516540421714036
|
|
||||||
global_step = 18408
|
|
||||||
loss = 0.04755385363816904
|
|
||||||
|
|
||||||
This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model
|
|
||||||
|
|
||||||
SQuAD
|
|
||||||
~~~~~
|
|
||||||
|
|
||||||
This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
|
|
||||||
|
|
||||||
The data for SQuAD can be downloaded with the following links and should be saved in a ``$SQUAD_DIR`` directory.
|
|
||||||
|
|
||||||
|
|
||||||
* `train-v1.1.json <https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json>`_
|
|
||||||
* `dev-v1.1.json <https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json>`_
|
|
||||||
* `evaluate-v1.1.py <https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py>`_
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
export SQUAD_DIR=/path/to/SQUAD
|
|
||||||
|
|
||||||
python run_bert_squad.py \
|
|
||||||
--bert_model bert-base-uncased \
|
|
||||||
--do_train \
|
|
||||||
--do_predict \
|
|
||||||
--do_lower_case \
|
|
||||||
--train_file $SQUAD_DIR/train-v1.1.json \
|
|
||||||
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
|
||||||
--train_batch_size 12 \
|
|
||||||
--learning_rate 3e-5 \
|
|
||||||
--num_train_epochs 2.0 \
|
|
||||||
--max_seq_length 384 \
|
|
||||||
--doc_stride 128 \
|
|
||||||
--output_dir /tmp/debug_squad/
|
|
||||||
|
|
||||||
Training with the previous hyper-parameters gave us the following results:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json
|
|
||||||
{"f1": 88.52381567990474, "exact_match": 81.22043519394512}
|
|
||||||
|
|
||||||
**distributed training**
|
|
||||||
|
|
||||||
Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python -m torch.distributed.launch --nproc_per_node=8 \
|
|
||||||
run_bert_squad.py \
|
|
||||||
--bert_model bert-large-uncased-whole-word-masking \
|
|
||||||
--do_train \
|
|
||||||
--do_predict \
|
|
||||||
--do_lower_case \
|
|
||||||
--train_file $SQUAD_DIR/train-v1.1.json \
|
|
||||||
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
|
||||||
--learning_rate 3e-5 \
|
|
||||||
--num_train_epochs 2 \
|
|
||||||
--max_seq_length 384 \
|
|
||||||
--doc_stride 128 \
|
|
||||||
--output_dir ../models/wwm_uncased_finetuned_squad/ \
|
|
||||||
--train_batch_size 24 \
|
|
||||||
--gradient_accumulation_steps 12
|
|
||||||
|
|
||||||
Training with these hyper-parameters gave us the following results:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
|
|
||||||
{"exact_match": 86.91579943235573, "f1": 93.1532499015869}
|
|
||||||
|
|
||||||
This is the model provided as ``bert-large-uncased-whole-word-masking-finetuned-squad``.
|
|
||||||
|
|
||||||
And here is the model provided as ``bert-large-cased-whole-word-masking-finetuned-squad``\ :
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python -m torch.distributed.launch --nproc_per_node=8 run_bert_squad.py \
|
|
||||||
--bert_model bert-large-cased-whole-word-masking \
|
|
||||||
--do_train \
|
|
||||||
--do_predict \
|
|
||||||
--do_lower_case \
|
|
||||||
--train_file $SQUAD_DIR/train-v1.1.json \
|
|
||||||
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
|
||||||
--learning_rate 3e-5 \
|
|
||||||
--num_train_epochs 2 \
|
|
||||||
--max_seq_length 384 \
|
|
||||||
--doc_stride 128 \
|
|
||||||
--output_dir ../models/wwm_cased_finetuned_squad/ \
|
|
||||||
--train_batch_size 24 \
|
|
||||||
--gradient_accumulation_steps 12
|
|
||||||
|
|
||||||
Training with these hyper-parameters gave us the following results:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_cased_finetuned_squad/predictions.json
|
|
||||||
{"exact_match": 84.18164616840113, "f1": 91.58645594850135}
|
|
||||||
|
|
||||||
SWAG
|
|
||||||
~~~~
|
|
||||||
|
|
||||||
The data for SWAG can be downloaded by cloning the following `repository <https://github.com/rowanz/swagaf>`_
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
export SWAG_DIR=/path/to/SWAG
|
|
||||||
|
|
||||||
python run_bert_swag.py \
|
|
||||||
--bert_model bert-base-uncased \
|
|
||||||
--do_train \
|
|
||||||
--do_lower_case \
|
|
||||||
--do_eval \
|
|
||||||
--data_dir $SWAG_DIR/data \
|
|
||||||
--train_batch_size 16 \
|
|
||||||
--learning_rate 2e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--max_seq_length 80 \
|
|
||||||
--output_dir /tmp/swag_output/ \
|
|
||||||
--gradient_accumulation_steps 4
|
|
||||||
|
|
||||||
Training with the previous hyper-parameters on a single GPU gave us the following results:
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
eval_accuracy = 0.8062081375587323
|
|
||||||
eval_loss = 0.5966546792367169
|
|
||||||
global_step = 13788
|
|
||||||
loss = 0.06423990014260186
|
|
||||||
|
|
||||||
LM Fine-tuning
|
|
||||||
~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
The data should be a text file in the same format as `sample_text.txt <./pytorch_transformers/tests/fixtures/sample_text.txt/sample_text.txt>`_ (one sentence per line, docs separated by empty line).
|
|
||||||
You can download an `exemplary training corpus <https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt>`_ generated from wikipedia articles and split into ~500k sentences with spaCy.
|
|
||||||
Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with ``train_batch_size=200`` and ``max_seq_length=128``\ :
|
|
||||||
|
|
||||||
Thanks to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the `README <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning/README.md>`_ of the `examples/lm_finetuning/ <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning/>`_ folder.
|
|
||||||
|
|
||||||
.. _fine-tuning:
|
|
||||||
|
|
||||||
OpenAI GPT, Transformer-XL and GPT-2: running the examples
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
We provide four examples of scripts for OpenAI GPT, Transformer-XL, OpenAI GPT-2, BERT and RoBERTa based on (and extended from) the respective original implementations:
|
|
||||||
|
|
||||||
|
|
||||||
* fine-tuning OpenAI GPT on the ROCStories dataset
|
|
||||||
* evaluating Transformer-XL on Wikitext 103
|
|
||||||
* unconditional and conditional generation from a pre-trained OpenAI GPT-2 model
|
|
||||||
* fine-tuning GPT/GPT-2 on a causal language modeling task and BERT/RoBERTa on a masked language modeling task
|
|
||||||
|
|
||||||
Fine-tuning OpenAI GPT on the RocStories dataset
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
This example code fine-tunes OpenAI GPT on the RocStories dataset.
|
|
||||||
|
|
||||||
Before running this example you should download the
|
|
||||||
`RocStories dataset <https://github.com/snigdhac/StoryComprehension_EMNLP/tree/master/Dataset/RoCStories>`_ and unpack it to some directory ``$ROC_STORIES_DIR``.
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
export ROC_STORIES_DIR=/path/to/RocStories
|
|
||||||
|
|
||||||
python run_openai_gpt.py \
|
|
||||||
--model_name openai-gpt \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
|
|
||||||
--eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
|
|
||||||
--output_dir ../log \
|
|
||||||
--train_batch_size 16 \
|
|
||||||
|
|
||||||
This command runs in about 10 min on a single K-80 and gives an evaluation accuracy of about 87.7% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
|
|
||||||
|
|
||||||
Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
This example code evaluates the pre-trained Transformer-XL on the WikiText 103 dataset.
|
|
||||||
This command will download a pre-processed version of the WikiText 103 dataset in which the vocabulary has been computed.
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
python run_transfo_xl.py --work_dir ../log
|
|
||||||
|
|
||||||
This command runs in about 1 min on a V100 and gives an evaluation perplexity of 18.22 on WikiText-103 (the authors report a perplexity of about 18.3 on this dataset with the TensorFlow code).
|
|
||||||
|
|
||||||
Unconditional and conditional generation from OpenAI's GPT-2 model
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
This example code is identical to the original unconditional and conditional generation codes.
|
|
||||||
|
|
||||||
Conditional generation:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
python run_gpt2.py
|
|
||||||
|
|
||||||
Unconditional generation:
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
python run_gpt2.py --unconditional
|
|
||||||
|
|
||||||
The same options as in the original scripts are provided; please refer to the code of the example and the original repository of OpenAI.
|
|
||||||
|
|
||||||
|
|
||||||
Causal LM fine-tuning on GPT/GPT-2, Masked LM fine-tuning on BERT/RoBERTa
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
Before running the following examples you should download the `WikiText-2 dataset <https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/>`__ and unpack it to some directory `$WIKITEXT_2_DATASET`
|
|
||||||
The following results were obtained using the `raw` WikiText-2 (no tokens were replaced before the tokenization).
|
|
||||||
|
|
||||||
This example fine-tunes GPT-2 on the WikiText-2 dataset. The loss function is a causal language modeling loss (perplexity).
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
export WIKITEXT_2_DATASET=/path/to/wikitext_dataset
|
|
||||||
|
|
||||||
python run_lm_finetuning.py
|
|
||||||
--output_dir=output
|
|
||||||
--model_type=gpt2
|
|
||||||
--model_name_or_path=gpt2
|
|
||||||
--do_train
|
|
||||||
--train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw
|
|
||||||
--do_eval
|
|
||||||
--eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw
|
|
||||||
|
|
||||||
This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run.
|
|
||||||
It reaches a score of about 20 perplexity once fine-tuned on the dataset.
|
|
||||||
|
|
||||||
This example fine-tunes RoBERTa on the WikiText-2 dataset. The loss function is a masked language modeling loss (masked perplexity).
|
|
||||||
The `--mlm` flag is necessary to fine-tune BERT/RoBERTa on masked language modeling.
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
export WIKITEXT_2_DATASET=/path/to/wikitext_dataset
|
|
||||||
|
|
||||||
python run_lm_finetuning.py
|
|
||||||
--output_dir=output
|
|
||||||
--model_type=roberta
|
|
||||||
--model_name_or_path=roberta-base
|
|
||||||
--do_train
|
|
||||||
--train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw
|
|
||||||
--do_eval
|
|
||||||
--eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw
|
|
||||||
--mlm
|
|
||||||
|
|
||||||
.. _fine-tuning-BERT-large:
|
|
||||||
|
|
||||||
Fine-tuning BERT-large on GPUs
|
|
||||||
------------------------------
|
|
||||||
|
|
||||||
The options we list above make it possible to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
|
|
||||||
|
|
||||||
For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher):
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
{"exact_match": 84.56953642384106, "f1": 91.04028647786927}
|
|
||||||
|
|
||||||
To get these results we used a combination of:
|
|
||||||
|
|
||||||
|
|
||||||
* multi-GPU training (automatically activated on a multi-GPU server),
|
|
||||||
* 2 steps of gradient accumulation and
|
|
||||||
* perform the optimization step on CPU to store Adam's averages in RAM.
|
|
||||||
|
|
||||||
Here is the full list of hyper-parameters for this run:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
export SQUAD_DIR=/path/to/SQUAD
|
|
||||||
|
|
||||||
python ./run_bert_squad.py \
|
|
||||||
--bert_model bert-large-uncased \
|
|
||||||
--do_train \
|
|
||||||
--do_predict \
|
|
||||||
--do_lower_case \
|
|
||||||
--train_file $SQUAD_DIR/train-v1.1.json \
|
|
||||||
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
|
||||||
--learning_rate 3e-5 \
|
|
||||||
--num_train_epochs 2 \
|
|
||||||
--max_seq_length 384 \
|
|
||||||
--doc_stride 128 \
|
|
||||||
--output_dir /tmp/debug_squad/ \
|
|
||||||
--train_batch_size 24 \
|
|
||||||
--gradient_accumulation_steps 2
|
|
||||||
|
|
||||||
If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16).
|
|
||||||
|
|
||||||
Here is an example of hyper-parameters for a FP16 run we tried:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
export SQUAD_DIR=/path/to/SQUAD
|
|
||||||
|
|
||||||
python ./run_bert_squad.py \
|
|
||||||
--bert_model bert-large-uncased \
|
|
||||||
--do_train \
|
|
||||||
--do_predict \
|
|
||||||
--do_lower_case \
|
|
||||||
--train_file $SQUAD_DIR/train-v1.1.json \
|
|
||||||
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
|
||||||
--learning_rate 3e-5 \
|
|
||||||
--num_train_epochs 2 \
|
|
||||||
--max_seq_length 384 \
|
|
||||||
--doc_stride 128 \
|
|
||||||
--output_dir /tmp/debug_squad/ \
|
|
||||||
--train_batch_size 24 \
|
|
||||||
--fp16 \
|
|
||||||
--loss_scale 128
|
|
||||||
|
|
||||||
The results were similar to the above FP32 results (actually slightly higher):
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
{"exact_match": 84.65468306527909, "f1": 91.238669287002}
|
|
||||||
|
|
||||||
Here is an example with the recent ``bert-large-uncased-whole-word-masking``\ :
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python -m torch.distributed.launch --nproc_per_node=8 \
|
|
||||||
run_bert_squad.py \
|
|
||||||
--bert_model bert-large-uncased-whole-word-masking \
|
|
||||||
--do_train \
|
|
||||||
--do_predict \
|
|
||||||
--do_lower_case \
|
|
||||||
--train_file $SQUAD_DIR/train-v1.1.json \
|
|
||||||
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
|
||||||
--learning_rate 3e-5 \
|
|
||||||
--num_train_epochs 2 \
|
|
||||||
--max_seq_length 384 \
|
|
||||||
--doc_stride 128 \
|
|
||||||
--output_dir /tmp/debug_squad/ \
|
|
||||||
--train_batch_size 24 \
|
|
||||||
--gradient_accumulation_steps 2
|
|
||||||
|
|
||||||
Fine-tuning XLNet
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
STS-B
|
|
||||||
~~~~~
|
|
||||||
|
|
||||||
This example code fine-tunes XLNet on the STS-B corpus.
|
|
||||||
|
|
||||||
Before running this example you should download the
|
|
||||||
`GLUE data <https://gluebenchmark.com/tasks>`_ by running
|
|
||||||
`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
|
|
||||||
and unpack it to some directory ``$GLUE_DIR``.
|
|
||||||
|
|
||||||
.. code-block:: shell
|
|
||||||
|
|
||||||
export GLUE_DIR=/path/to/glue
|
|
||||||
|
|
||||||
python run_xlnet_classifier.py \
|
|
||||||
--task_name STS-B \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--data_dir $GLUE_DIR/STS-B/ \
|
|
||||||
--max_seq_length 128 \
|
|
||||||
--train_batch_size 8 \
|
|
||||||
--gradient_accumulation_steps 1 \
|
|
||||||
--learning_rate 5e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--output_dir /tmp/mrpc_output/
|
|
||||||
|
|
||||||
Our tests, run on a few seeds with `the original implementation hyper-parameters <https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus>`__, gave evaluation results between 84% and 88%.
|
|
||||||
|
|
||||||
**Distributed training**
|
|
||||||
Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python -m torch.distributed.launch --nproc_per_node 8 \
|
|
||||||
run_xlnet_classifier.py \
|
|
||||||
--task_name STS-B \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--data_dir $GLUE_DIR/STS-B/ \
|
|
||||||
--max_seq_length 128 \
|
|
||||||
--train_batch_size 8 \
|
|
||||||
--gradient_accumulation_steps 1 \
|
|
||||||
--learning_rate 5e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--output_dir /tmp/mrpc_output/
|
|
||||||
|
|
||||||
Training with these hyper-parameters gave us the following results:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
acc = 0.8823529411764706
|
|
||||||
acc_and_f1 = 0.901702786377709
|
|
||||||
eval_loss = 0.3418912578906332
|
|
||||||
f1 = 0.9210526315789473
|
|
||||||
global_step = 174
|
|
||||||
loss = 0.07231863956341798
|
|
||||||
|
|
||||||
Here is an example on MNLI:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \
|
|
||||||
--bert_model bert-large-uncased-whole-word-masking \
|
|
||||||
--task_name mnli \
|
|
||||||
--do_train \
|
|
||||||
--do_eval \
|
|
||||||
--data_dir /datadrive/bert_data/glue_data//MNLI/ \
|
|
||||||
--max_seq_length 128 \
|
|
||||||
--train_batch_size 8 \
|
|
||||||
--learning_rate 2e-5 \
|
|
||||||
--num_train_epochs 3.0 \
|
|
||||||
--output_dir ../models/wwm-uncased-finetuned-mnli/ \
|
|
||||||
--overwrite_output_dir
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
***** Eval results *****
|
|
||||||
acc = 0.8679706601466992
|
|
||||||
eval_loss = 0.4911287787382479
|
|
||||||
global_step = 18408
|
|
||||||
loss = 0.04755385363816904
|
|
||||||
|
|
||||||
***** Eval results *****
|
|
||||||
acc = 0.8747965825874695
|
|
||||||
eval_loss = 0.45516540421714036
|
|
||||||
global_step = 18408
|
|
||||||
loss = 0.04755385363816904
|
|
||||||
|
|
||||||
This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
|
|
||||||
@@ -11,6 +11,8 @@ The library currently contains PyTorch implementations, pre-trained model weight
|
|||||||
4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||||
5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||||
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
|
||||||
|
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||||
|
8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|||||||
@@ -52,6 +52,12 @@ If you want to reproduce the original tokenization process of the ``OpenAI GPT``
|
|||||||
If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
|
If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
|
||||||
|
|
||||||
|
|
||||||
|
Note on model downloads (Continuous Integration or large-scale deployments)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
|
||||||
|
|
||||||
|
|
||||||
Do you want to run a Transformer model on a mobile device?
|
Do you want to run a Transformer model on a mobile device?
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
|||||||
@@ -2,35 +2,35 @@ DistilBERT
|
|||||||
----------------------------------------------------
|
----------------------------------------------------
|
||||||
|
|
||||||
``DistilBertConfig``
|
``DistilBertConfig``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertConfig
|
.. autoclass:: pytorch_transformers.DistilBertConfig
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertTokenizer``
|
``DistilBertTokenizer``
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertTokenizer
|
.. autoclass:: pytorch_transformers.DistilBertTokenizer
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertModel``
|
``DistilBertModel``
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertModel
|
.. autoclass:: pytorch_transformers.DistilBertModel
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertForMaskedLM``
|
``DistilBertForMaskedLM``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
|
.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
``DistilBertForSequenceClassification``
|
``DistilBertForSequenceClassification``
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
|
.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
|
||||||
:members:
|
:members:
|
||||||
|
|||||||
357
examples/README.md
Normal file
357
examples/README.md
Normal file
@@ -0,0 +1,357 @@
|
|||||||
|
# Examples
|
||||||
|
|
||||||
|
In this section a few examples are put together. All of these examples work for several models, making use of the very
|
||||||
|
similar API between the different models.
|
||||||
|
|
||||||
|
| Section | Description |
|
||||||
|
|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
|
||||||
|
| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
|
||||||
|
| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
|
||||||
|
| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training. |
|
||||||
|
|
||||||
|
## Language model fine-tuning
|
||||||
|
|
||||||
|
Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py).
|
||||||
|
|
||||||
|
Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT
|
||||||
|
to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa
|
||||||
|
are fine-tuned using a masked language modeling (MLM) loss.
|
||||||
|
|
||||||
|
Before running the following example, you should get a file that contains text on which the language model will be
|
||||||
|
fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
|
||||||
|
|
||||||
|
We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
|
||||||
|
text that will be used for evaluation.
|
||||||
|
|
||||||
|
### GPT-2/GPT and causal language modeling
|
||||||
|
|
||||||
|
The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
|
||||||
|
the tokenization). The loss here is that of causal language modeling.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
|
||||||
|
export TEST_FILE=/path/to/dataset/wiki.test.raw
|
||||||
|
|
||||||
|
python run_lm_finetuning.py \
|
||||||
|
--output_dir=output \
|
||||||
|
--model_type=gpt2 \
|
||||||
|
--model_name_or_path=gpt2 \
|
||||||
|
--do_train \
|
||||||
|
--train_data_file=$TRAIN_FILE \
|
||||||
|
--do_eval \
|
||||||
|
--eval_data_file=$TEST_FILE
|
||||||
|
```
|
||||||
|
|
||||||
|
This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
|
||||||
|
a score of ~20 perplexity once fine-tuned on the dataset.
|
||||||
|
|
||||||
|
### RoBERTa/BERT and masked language modeling
|
||||||
|
|
||||||
|
The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
|
||||||
|
as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
|
||||||
|
pre-training: masked language modeling.
|
||||||
|
|
||||||
|
In accordance with the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
|
||||||
|
slightly slower (over-fitting takes more epochs).
|
||||||
|
|
||||||
|
We use the `--mlm` flag so that the script may change its loss function.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
|
||||||
|
export TEST_FILE=/path/to/dataset/wiki.test.raw
|
||||||
|
|
||||||
|
python run_lm_finetuning.py \
|
||||||
|
--output_dir=output \
|
||||||
|
--model_type=roberta \
|
||||||
|
--model_name_or_path=roberta-base \
|
||||||
|
--do_train \
|
||||||
|
--train_data_file=$TRAIN_FILE \
|
||||||
|
--do_eval \
|
||||||
|
--eval_data_file=$TEST_FILE \
|
||||||
|
--mlm
|
||||||
|
```
|
||||||
|
|
||||||
|
## Language generation
|
||||||
|
|
||||||
|
Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py).
|
||||||
|
|
||||||
|
Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
|
||||||
|
A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
|
||||||
|
can try out the different models available in the library.
|
||||||
|
|
||||||
|
Example usage:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python run_generation.py \
|
||||||
|
--model_type=gpt2 \
|
||||||
|
--model_name_or_path=gpt2
|
||||||
|
```
|
||||||
|
|
||||||
|
## GLUE
|
||||||
|
|
||||||
|
Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py).
|
||||||
|
|
||||||
|
Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
|
||||||
|
Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
|
||||||
|
|
||||||
|
GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
|
||||||
|
uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train
|
||||||
|
batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
|
||||||
|
between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
|
||||||
|
|
||||||
|
| Task | Metric | Result |
|
||||||
|
|-------|------------------------------|-------------|
|
||||||
|
| CoLA | Matthew's corr | 55.75 |
|
||||||
|
| SST-2 | Accuracy | 92.09 |
|
||||||
|
| MRPC | F1/Accuracy | 90.48/86.27 |
|
||||||
|
| STS-B | Pearson/Spearman corr. | 89.03/88.64 |
|
||||||
|
| QQP | Accuracy/F1 | 90.92/87.72 |
|
||||||
|
| MNLI | Matched acc./Mismatched acc. | 83.74/84.06 |
|
||||||
|
| QNLI | Accuracy | 91.07 |
|
||||||
|
| RTE | Accuracy | 68.59 |
|
||||||
|
| WNLI | Accuracy | 43.66 |
|
||||||
|
|
||||||
|
Some of these results are significantly different from the ones reported on the test set
|
||||||
|
of the GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the website.
|
||||||
|
|
||||||
|
Before running any one of these GLUE tasks you should download the
|
||||||
|
[GLUE data](https://gluebenchmark.com/tasks) by running
|
||||||
|
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
|
||||||
|
and unpack it to some directory `$GLUE_DIR`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export GLUE_DIR=/path/to/glue
|
||||||
|
export TASK_NAME=MRPC
|
||||||
|
|
||||||
|
python run_glue.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-cased \
|
||||||
|
--task_name $TASK_NAME \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_lower_case \
|
||||||
|
--data_dir $GLUE_DIR/$TASK_NAME \
|
||||||
|
--max_seq_length 128 \
|
||||||
|
--per_gpu_train_batch_size 32 \
|
||||||
|
--learning_rate 2e-5 \
|
||||||
|
--num_train_epochs 3.0 \
|
||||||
|
--output_dir /tmp/$TASK_NAME/
|
||||||
|
```
|
||||||
|
|
||||||
|
where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
|
||||||
|
|
||||||
|
The dev set results will be present within the text file `eval_results.txt` in the specified output_dir.
|
||||||
|
In case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate
|
||||||
|
output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
|
||||||
|
|
||||||
|
The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI,
|
||||||
|
CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being
|
||||||
|
said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well,
|
||||||
|
since the data processor for each task inherits from the base class DataProcessor.
|
||||||
|
|
||||||
|
### MRPC
|
||||||
|
|
||||||
|
#### Fine-tuning example
|
||||||
|
|
||||||
|
The following example fine-tunes BERT on the Microsoft Research Paraphrase Corpus (MRPC) and runs in less
|
||||||
|
than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
|
||||||
|
|
||||||
|
Before running any one of these GLUE tasks you should download the
|
||||||
|
[GLUE data](https://gluebenchmark.com/tasks) by running
|
||||||
|
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
|
||||||
|
and unpack it to some directory `$GLUE_DIR`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export GLUE_DIR=/path/to/glue
|
||||||
|
|
||||||
|
python run_glue.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-cased \
|
||||||
|
--task_name MRPC \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_lower_case \
|
||||||
|
--data_dir $GLUE_DIR/MRPC/ \
|
||||||
|
--max_seq_length 128 \
|
||||||
|
--per_gpu_train_batch_size 32 \
|
||||||
|
--learning_rate 2e-5 \
|
||||||
|
--num_train_epochs 3.0 \
|
||||||
|
--output_dir /tmp/mrpc_output/
|
||||||
|
```
|
||||||
|
|
||||||
|
Our test ran on a few seeds with [the original implementation hyper-
|
||||||
|
parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation
|
||||||
|
results between 84% and 88%.
|
||||||
|
|
||||||
|
#### Using Apex and mixed-precision
|
||||||
|
|
||||||
|
Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install
|
||||||
|
[apex](https://github.com/NVIDIA/apex), then run the following example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export GLUE_DIR=/path/to/glue
|
||||||
|
|
||||||
|
python run_glue.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-cased \
|
||||||
|
--task_name MRPC \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_lower_case \
|
||||||
|
--data_dir $GLUE_DIR/MRPC/ \
|
||||||
|
--max_seq_length 128 \
|
||||||
|
--per_gpu_train_batch_size 32 \
|
||||||
|
--learning_rate 2e-5 \
|
||||||
|
--num_train_epochs 3.0 \
|
||||||
|
--output_dir /tmp/mrpc_output/ \
|
||||||
|
--fp16
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Distributed training
|
||||||
|
|
||||||
|
Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it
|
||||||
|
reaches F1 > 92 on MRPC.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export GLUE_DIR=/path/to/glue
|
||||||
|
|
||||||
|
python -m torch.distributed.launch \
|
||||||
|
--nproc_per_node 8 run_glue.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-cased \
|
||||||
|
--task_name MRPC \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_lower_case \
|
||||||
|
--data_dir $GLUE_DIR/MRPC/ \
|
||||||
|
--max_seq_length 128 \
|
||||||
|
--per_gpu_train_batch_size 8 \
|
||||||
|
--learning_rate 2e-5 \
|
||||||
|
--num_train_epochs 3.0 \
|
||||||
|
--output_dir /tmp/mrpc_output/
|
||||||
|
```
|
||||||
|
|
||||||
|
Training with these hyper-parameters gave us the following results:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
acc = 0.8823529411764706
|
||||||
|
acc_and_f1 = 0.901702786377709
|
||||||
|
eval_loss = 0.3418912578906332
|
||||||
|
f1 = 0.9210526315789473
|
||||||
|
global_step = 174
|
||||||
|
loss = 0.07231863956341798
|
||||||
|
```
|
||||||
|
|
||||||
|
### MNLI
|
||||||
|
|
||||||
|
The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export GLUE_DIR=/path/to/glue
|
||||||
|
|
||||||
|
python -m torch.distributed.launch \
|
||||||
|
--nproc_per_node 8 run_glue.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-cased \
|
||||||
|
--task_name mnli \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_lower_case \
|
||||||
|
--data_dir $GLUE_DIR/MNLI/ \
|
||||||
|
--max_seq_length 128 \
|
||||||
|
--per_gpu_train_batch_size 8 \
|
||||||
|
--learning_rate 2e-5 \
|
||||||
|
--num_train_epochs 3.0 \
|
||||||
|
--output_dir output_dir
|
||||||
|
```
|
||||||
|
|
||||||
|
The results are the following:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
***** Eval results *****
|
||||||
|
acc = 0.8679706601466992
|
||||||
|
eval_loss = 0.4911287787382479
|
||||||
|
global_step = 18408
|
||||||
|
loss = 0.04755385363816904
|
||||||
|
|
||||||
|
***** Eval results *****
|
||||||
|
acc = 0.8747965825874695
|
||||||
|
eval_loss = 0.45516540421714036
|
||||||
|
global_step = 18408
|
||||||
|
loss = 0.04755385363816904
|
||||||
|
```
|
||||||
|
|
||||||
|
## SQuAD
|
||||||
|
|
||||||
|
Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py).
|
||||||
|
|
||||||
|
#### Fine-tuning on SQuAD
|
||||||
|
|
||||||
|
This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
|
||||||
|
on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a
|
||||||
|
$SQUAD_DIR directory.
|
||||||
|
|
||||||
|
* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
|
||||||
|
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
|
||||||
|
* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export SQUAD_DIR=/path/to/SQUAD
|
||||||
|
|
||||||
|
python run_squad.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-cased \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_lower_case \
|
||||||
|
--train_file $SQUAD_DIR/train-v1.1.json \
|
||||||
|
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
||||||
|
--per_gpu_train_batch_size 12 \
|
||||||
|
--learning_rate 3e-5 \
|
||||||
|
--num_train_epochs 2.0 \
|
||||||
|
--max_seq_length 384 \
|
||||||
|
--doc_stride 128 \
|
||||||
|
--output_dir /tmp/debug_squad/
|
||||||
|
```
|
||||||
|
|
||||||
|
Training with the previously defined hyper-parameters yields the following results:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
f1 = 88.52
|
||||||
|
exact_match = 81.22
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Distributed training
|
||||||
|
|
||||||
|
|
||||||
|
Here is an example using distributed training on 8 V100 GPUs and the BERT Whole Word Masking uncased model to reach an F1 > 93 on SQuAD:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
|
||||||
|
--model_type bert \
|
||||||
|
--model_name_or_path bert-base-cased \
|
||||||
|
--do_train \
|
||||||
|
--do_eval \
|
||||||
|
--do_lower_case \
|
||||||
|
--train_file $SQUAD_DIR/train-v1.1.json \
|
||||||
|
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
||||||
|
--learning_rate 3e-5 \
|
||||||
|
--num_train_epochs 2 \
|
||||||
|
--max_seq_length 384 \
|
||||||
|
--doc_stride 128 \
|
||||||
|
--output_dir ../models/wwm_uncased_finetuned_squad/ \
|
||||||
|
--per_gpu_train_batch_size 24 \
|
||||||
|
--gradient_accumulation_steps 12
|
||||||
|
```
|
||||||
|
|
||||||
|
Training with the previously defined hyper-parameters yields the following results:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
f1 = 93.15
|
||||||
|
exact_match = 86.91
|
||||||
|
```
|
||||||
|
|
||||||
|
This fine-tuned model is available as a checkpoint under the reference
|
||||||
|
`bert-large-uncased-whole-word-masking-finetuned-squad`.
|
||||||
|
|
||||||
@@ -9,6 +9,12 @@ DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and l
|
|||||||
For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
|
For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
|
||||||
).
|
).
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
This part of the library has only been tested with Python3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
|
||||||
|
|
||||||
|
**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend [compiling PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details.
|
||||||
|
|
||||||
## How to use DistilBERT
|
## How to use DistilBERT
|
||||||
|
|
||||||
PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
|
||||||
@@ -68,7 +74,7 @@ python train.py \
|
|||||||
|
|
||||||
By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
|
By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
|
||||||
|
|
||||||
We highly encourage you to distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
|
We highly encourage you to use distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export NODE_RANK=0
|
export NODE_RANK=0
|
||||||
@@ -90,11 +96,11 @@ python -m torch.distributed.launch \
|
|||||||
train.py \
|
train.py \
|
||||||
--force \
|
--force \
|
||||||
--n_gpu $WORLD_SIZE \
|
--n_gpu $WORLD_SIZE \
|
||||||
--data_file data/dump_concat_wiki_toronto_bk.bert-base-uncased.pickle \
|
--data_file data/binarized_text.bert-base-uncased.pickle \
|
||||||
--token_counts data/token_counts_concat_wiki_toronto_bk.bert-base-uncased.pickle \
|
--token_counts data/token_counts.bert-base-uncased.pickle \
|
||||||
--dump_path serialization_dir/with_transform/last_word
|
--dump_path serialization_dir/my_first_distillation
|
||||||
```
|
```
|
||||||
|
|
||||||
**Tips** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!
|
**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!
|
||||||
|
|
||||||
Happy distillation!
|
Happy distillation!
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ class Dataset:
|
|||||||
if sub_s[0] != cls_id:
|
if sub_s[0] != cls_id:
|
||||||
sub_s = np.insert(sub_s, 0, cls_id)
|
sub_s = np.insert(sub_s, 0, cls_id)
|
||||||
if sub_s[-1] != sep_id:
|
if sub_s[-1] != sep_id:
|
||||||
sub_s = np.insert(sub_s, len(sub_s), cls_id)
|
sub_s = np.insert(sub_s, len(sub_s), sep_id)
|
||||||
assert len(sub_s) <= max_len
|
assert len(sub_s) <= max_len
|
||||||
sub_seqs.append(sub_s)
|
sub_seqs.append(sub_s)
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import math
|
import math
|
||||||
|
import psutil
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from tqdm import trange, tqdm
|
from tqdm import trange, tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -192,7 +193,7 @@ class Distiller:
|
|||||||
x_prob = self.token_probs[token_ids.flatten()]
|
x_prob = self.token_probs[token_ids.flatten()]
|
||||||
n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
|
n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
|
||||||
tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
|
tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
|
||||||
pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.uint8, device=token_ids.device)
|
pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
|
||||||
pred_mask[tgt_ids] = 1
|
pred_mask[tgt_ids] = 1
|
||||||
pred_mask = pred_mask.view(bs, max_seq_len)
|
pred_mask = pred_mask.view(bs, max_seq_len)
|
||||||
|
|
||||||
@@ -216,7 +217,7 @@ class Distiller:
|
|||||||
_token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
|
_token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
|
||||||
token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
|
token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
|
||||||
|
|
||||||
mlm_labels[1-pred_mask] = -1
|
mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||||
|
|
||||||
return token_ids, attn_mask, mlm_labels
|
return token_ids, attn_mask, mlm_labels
|
||||||
|
|
||||||
@@ -294,7 +295,10 @@ class Distiller:
|
|||||||
if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
|
if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
|
||||||
self.end_epoch()
|
self.end_epoch()
|
||||||
|
|
||||||
if self.is_master: logger.info('Training is finished')
|
if self.is_master:
|
||||||
|
logger.info(f'Save very last checkpoint as `pytorch_model.bin`.')
|
||||||
|
self.save_checkpoint(checkpoint_name=f'pytorch_model.bin')
|
||||||
|
logger.info('Training is finished')
|
||||||
|
|
||||||
def step(self,
|
def step(self,
|
||||||
input_ids: torch.tensor,
|
input_ids: torch.tensor,
|
||||||
@@ -379,9 +383,9 @@ class Distiller:
|
|||||||
torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
|
torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
|
||||||
else:
|
else:
|
||||||
torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
|
torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
|
||||||
self.scheduler.step()
|
|
||||||
self.optimizer.step()
|
self.optimizer.step()
|
||||||
self.optimizer.zero_grad()
|
self.optimizer.zero_grad()
|
||||||
|
self.scheduler.step()
|
||||||
|
|
||||||
def iter(self):
|
def iter(self):
|
||||||
"""
|
"""
|
||||||
@@ -418,6 +422,8 @@ class Distiller:
|
|||||||
if self.alpha_mse > 0.:
|
if self.alpha_mse > 0.:
|
||||||
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
|
||||||
self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
|
self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
|
||||||
|
|
||||||
|
self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
|
||||||
|
|
||||||
def end_epoch(self):
|
def end_epoch(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1 +1,4 @@
|
|||||||
gitpython==3.0.2
|
gitpython==3.0.2
|
||||||
|
tensorboard>=1.14.0
|
||||||
|
tensorboardX==1.8
|
||||||
|
psutil==5.6.3
|
||||||
|
|||||||
@@ -21,8 +21,12 @@ import random
|
|||||||
import time
|
import time
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pytorch_transformers import BertTokenizer
|
from pytorch_transformers import BertTokenizer
|
||||||
|
import logging
|
||||||
|
|
||||||
from ..utils import logger
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
|
level = logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
|
parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
|
||||||
@@ -74,4 +78,4 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -18,8 +18,12 @@ Preprocessing script before training DistilBERT.
|
|||||||
from collections import Counter
|
from collections import Counter
|
||||||
import argparse
|
import argparse
|
||||||
import pickle
|
import pickle
|
||||||
|
import logging
|
||||||
|
|
||||||
from utils import logger
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
|
level = logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
|
parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
|
||||||
|
|||||||
@@ -235,8 +235,9 @@ def main():
|
|||||||
|
|
||||||
# Prepare model
|
# Prepare model
|
||||||
model = BertForPreTraining.from_pretrained(args.bert_model)
|
model = BertForPreTraining.from_pretrained(args.bert_model)
|
||||||
if args.fp16:
|
# We don't need to call model.half() manually, following Apex's recommendation
|
||||||
model.half()
|
# if args.fp16:
|
||||||
|
# model.half()
|
||||||
model.to(device)
|
model.to(device)
|
||||||
if args.local_rank != -1:
|
if args.local_rank != -1:
|
||||||
try:
|
try:
|
||||||
@@ -257,25 +258,36 @@ def main():
|
|||||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
|
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
|
||||||
|
t_total=num_train_optimization_steps)
|
||||||
|
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex.optimizers import FP16_Optimizer
|
# from apex.optimizers import FP16_Optimizer
|
||||||
from apex.optimizers import FusedAdam
|
# from apex.optimizers import FusedAdam
|
||||||
|
from apex import amp
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||||
|
|
||||||
optimizer = FusedAdam(optimizer_grouped_parameters,
|
# The line below is the main upgrade of the Apex FP16 implementation. I chose opt_level="O1"
|
||||||
lr=args.learning_rate,
|
# because it's recommended for typical use by Apex. We could make it configurable
|
||||||
bias_correction=False,
|
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
|
||||||
max_grad_norm=1.0)
|
|
||||||
if args.loss_scale == 0:
|
# We don't need to wrap FusedAdam in FP16_Optimizer either. Apex now supports all PyTorch optimizers
|
||||||
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
|
||||||
else:
|
# optimizer = FusedAdam(optimizer_grouped_parameters,
|
||||||
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
|
# lr=args.learning_rate,
|
||||||
else:
|
# bias_correction=False,
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
# max_grad_norm=1.0)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
|
# if args.loss_scale == 0:
|
||||||
|
# optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||||
|
# else:
|
||||||
|
# optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
|
||||||
|
# else:
|
||||||
|
# optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
|
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
|
||||||
|
|
||||||
global_step = 0
|
global_step = 0
|
||||||
logging.info("***** Running training *****")
|
logging.info("***** Running training *****")
|
||||||
@@ -304,7 +316,10 @@ def main():
|
|||||||
if args.gradient_accumulation_steps > 1:
|
if args.gradient_accumulation_steps > 1:
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
optimizer.backward(loss)
|
# I deprecate FP16_Optimizer's backward func and replace it as the Apex documentation describes
|
||||||
|
# optimizer.backward(loss)
|
||||||
|
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||||
|
scaled_loss.backward()
|
||||||
else:
|
else:
|
||||||
loss.backward()
|
loss.backward()
|
||||||
tr_loss += loss.item()
|
tr_loss += loss.item()
|
||||||
|
|||||||
@@ -329,7 +329,8 @@ def main():
|
|||||||
doc = []
|
doc = []
|
||||||
else:
|
else:
|
||||||
tokens = tokenizer.tokenize(line)
|
tokens = tokenizer.tokenize(line)
|
||||||
doc.append(tokens)
|
if tokens:
|
||||||
|
doc.append(tokens)
|
||||||
if doc:
|
if doc:
|
||||||
docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added
|
docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added
|
||||||
if len(docs) <= 1:
|
if len(docs) <= 1:
|
||||||
|
|||||||
@@ -474,6 +474,7 @@ def main():
|
|||||||
# Evaluation
|
# Evaluation
|
||||||
results = {}
|
results = {}
|
||||||
if args.do_eval and args.local_rank in [-1, 0]:
|
if args.do_eval and args.local_rank in [-1, 0]:
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||||
checkpoints = [args.output_dir]
|
checkpoints = [args.output_dir]
|
||||||
if args.eval_all_checkpoints:
|
if args.eval_all_checkpoints:
|
||||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||||
|
|||||||
@@ -14,7 +14,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa).
|
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
|
||||||
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
|
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
|
||||||
using a masked language modeling (MLM) loss.
|
using a masked language modeling (MLM) loss.
|
||||||
"""
|
"""
|
||||||
@@ -247,7 +247,6 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
# Loop to handle MNLI double evaluation (matched, mis-matched)
|
# Loop to handle MNLI double evaluation (matched, mis-matched)
|
||||||
eval_output_dir = args.output_dir
|
eval_output_dir = args.output_dir
|
||||||
|
|
||||||
results = {}
|
|
||||||
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
|
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
|
||||||
|
|
||||||
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
|
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
|
||||||
@@ -289,7 +288,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
|||||||
logger.info(" %s = %s", key, str(result[key]))
|
logger.info(" %s = %s", key, str(result[key]))
|
||||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||||
|
|
||||||
return results
|
return result
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
140
hubconf.py
140
hubconf.py
@@ -1,30 +1,112 @@
|
|||||||
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']
|
from pytorch_transformers import (
|
||||||
|
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
|
||||||
|
)
|
||||||
|
from pytorch_transformers.file_utils import add_start_docstrings
|
||||||
|
|
||||||
from hubconfs.bert_hubconf import (
|
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
|
||||||
bertTokenizer,
|
|
||||||
bertModel,
|
@add_start_docstrings(AutoConfig.__doc__)
|
||||||
bertForNextSentencePrediction,
|
def config(*args, **kwargs):
|
||||||
bertForPreTraining,
|
r"""
|
||||||
bertForMaskedLM,
|
# Using torch.hub !
|
||||||
bertForSequenceClassification,
|
import torch
|
||||||
bertForMultipleChoice,
|
|
||||||
bertForQuestionAnswering,
|
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.
|
||||||
bertForTokenClassification
|
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||||
)
|
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
|
||||||
from hubconfs.gpt_hubconf import (
|
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
|
||||||
openAIGPTTokenizer,
|
assert config.output_attention == True
|
||||||
openAIGPTModel,
|
config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
|
||||||
openAIGPTLMHeadModel,
|
assert config.output_attention == True
|
||||||
openAIGPTDoubleHeadsModel
|
assert unused_kwargs == {'foo': False}
|
||||||
)
|
|
||||||
from hubconfs.gpt2_hubconf import (
|
"""
|
||||||
gpt2Tokenizer,
|
|
||||||
gpt2Model,
|
return AutoConfig.from_pretrained(*args, **kwargs)
|
||||||
gpt2LMHeadModel,
|
|
||||||
gpt2DoubleHeadsModel
|
|
||||||
)
|
@add_start_docstrings(AutoTokenizer.__doc__)
|
||||||
from hubconfs.transformer_xl_hubconf import (
|
def tokenizer(*args, **kwargs):
|
||||||
transformerXLTokenizer,
|
r"""
|
||||||
transformerXLModel,
|
# Using torch.hub !
|
||||||
transformerXLLMHeadModel
|
import torch
|
||||||
)
|
|
||||||
|
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache.
|
||||||
|
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
return AutoTokenizer.from_pretrained(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings(AutoModel.__doc__)
|
||||||
|
def model(*args, **kwargs):
|
||||||
|
r"""
|
||||||
|
# Using torch.hub !
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
|
assert model.config.output_attention == True
|
||||||
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
return AutoModel.from_pretrained(*args, **kwargs)
|
||||||
|
|
||||||
|
@add_start_docstrings(AutoModelWithLMHead.__doc__)
|
||||||
|
def modelWithLMHead(*args, **kwargs):
|
||||||
|
r"""
|
||||||
|
# Using torch.hub !
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
|
assert model.config.output_attention == True
|
||||||
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
|
"""
|
||||||
|
return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
|
||||||
|
def modelForSequenceClassification(*args, **kwargs):
|
||||||
|
r"""
|
||||||
|
# Using torch.hub !
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
|
assert model.config.output_attention == True
|
||||||
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
|
||||||
|
def modelForQuestionAnswering(*args, **kwargs):
|
||||||
|
r"""
|
||||||
|
# Using torch.hub !
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||||
|
assert model.config.output_attention == True
|
||||||
|
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||||
|
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||||
|
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||||
|
|
||||||
|
"""
|
||||||
|
return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
|
||||||
|
|||||||
@@ -1,360 +0,0 @@
|
|||||||
from pytorch_transformers.tokenization_bert import BertTokenizer
|
|
||||||
from pytorch_transformers.modeling_bert import (
|
|
||||||
BertModel,
|
|
||||||
BertForNextSentencePrediction,
|
|
||||||
BertForMaskedLM,
|
|
||||||
BertForMultipleChoice,
|
|
||||||
BertForPreTraining,
|
|
||||||
BertForQuestionAnswering,
|
|
||||||
BertForSequenceClassification,
|
|
||||||
BertForTokenClassification,
|
|
||||||
)
|
|
||||||
|
|
||||||
# A lot of models share the same param doc. Use a decorator
|
|
||||||
# to save typing
|
|
||||||
bert_docstring = """
|
|
||||||
Params:
|
|
||||||
pretrained_model_name_or_path: either:
|
|
||||||
- a str with the name of a pre-trained model to load
|
|
||||||
. `bert-base-uncased`
|
|
||||||
. `bert-large-uncased`
|
|
||||||
. `bert-base-cased`
|
|
||||||
. `bert-large-cased`
|
|
||||||
. `bert-base-multilingual-uncased`
|
|
||||||
. `bert-base-multilingual-cased`
|
|
||||||
. `bert-base-chinese`
|
|
||||||
. `bert-base-german-cased`
|
|
||||||
. `bert-large-uncased-whole-word-masking`
|
|
||||||
. `bert-large-cased-whole-word-masking`
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `bert_config.json` a configuration file for the model
|
|
||||||
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
|
|
||||||
instance
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `bert_config.json` a configuration file for the model
|
|
||||||
. `model.chkpt` a TensorFlow checkpoint
|
|
||||||
from_tf: should we load the weights from a locally saved TensorFlow
|
|
||||||
checkpoint
|
|
||||||
cache_dir: an optional path to a folder in which the pre-trained models
|
|
||||||
will be cached.
|
|
||||||
state_dict: an optional state dictionary
|
|
||||||
(collections.OrderedDict object) to use instead of Google
|
|
||||||
pre-trained models
|
|
||||||
*inputs, **kwargs: additional input for the specific Bert class
|
|
||||||
(ex: num_labels for BertForSequenceClassification)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def _append_from_pretrained_docstring(docstr):
|
|
||||||
def docstring_decorator(fn):
|
|
||||||
fn.__doc__ = fn.__doc__ + docstr
|
|
||||||
return fn
|
|
||||||
return docstring_decorator
|
|
||||||
|
|
||||||
|
|
||||||
def bertTokenizer(*args, **kwargs):
|
|
||||||
"""
|
|
||||||
Instantiate a BertTokenizer from a pre-trained/customized vocab file
|
|
||||||
Args:
|
|
||||||
pretrained_model_name_or_path: Path to pretrained model archive
|
|
||||||
or one of pre-trained vocab configs below.
|
|
||||||
* bert-base-uncased
|
|
||||||
* bert-large-uncased
|
|
||||||
* bert-base-cased
|
|
||||||
* bert-large-cased
|
|
||||||
* bert-base-multilingual-uncased
|
|
||||||
* bert-base-multilingual-cased
|
|
||||||
* bert-base-chinese
|
|
||||||
Keyword args:
|
|
||||||
cache_dir: an optional path to a specific directory to download and cache
|
|
||||||
the pre-trained model weights.
|
|
||||||
Default: None
|
|
||||||
do_lower_case: Whether to lower case the input.
|
|
||||||
Only has an effect when do_wordpiece_only=False
|
|
||||||
Default: True
|
|
||||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
|
|
||||||
Default: True
|
|
||||||
max_len: An artificial maximum length to truncate tokenized sequences to;
|
|
||||||
Effective maximum length is always the minimum of this
|
|
||||||
value (if specified) and the underlying BERT model's
|
|
||||||
sequence length.
|
|
||||||
Default: None
|
|
||||||
never_split: List of tokens which will never be split during tokenization.
|
|
||||||
Only has an effect when do_wordpiece_only=False
|
|
||||||
Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
|
|
||||||
|
|
||||||
Example:
|
|
||||||
import torch
|
|
||||||
sentence = 'Hello, World!'
|
|
||||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
|
||||||
toks = tokenizer.tokenize(sentence)
|
|
||||||
['Hello', '##,', 'World', '##!']
|
|
||||||
ids = tokenizer.convert_tokens_to_ids(toks)
|
|
||||||
[8667, 28136, 1291, 28125]
|
|
||||||
"""
|
|
||||||
tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
|
|
||||||
return tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(bert_docstring)
|
|
||||||
def bertModel(*args, **kwargs):
|
|
||||||
"""
|
|
||||||
BertModel is the basic BERT Transformer model with a layer of summed token,
|
|
||||||
position and sequence embeddings followed by a series of identical
|
|
||||||
self-attention blocks (12 for BERT-base, 24 for BERT-large).
|
|
||||||
|
|
||||||
Example:
|
|
||||||
# Load the tokenizer
|
|
||||||
import torch
|
|
||||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
|
||||||
# Prepare tokenized input
|
|
||||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
|
||||||
tokenized_text = tokenizer.tokenize(text)
|
|
||||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
|
||||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
|
||||||
tokens_tensor = torch.tensor([indexed_tokens])
|
|
||||||
segments_tensors = torch.tensor([segments_ids])
|
|
||||||
# Load bertModel
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
|
|
||||||
model.eval()
|
|
||||||
# Predict hidden states features for each layer
|
|
||||||
with torch.no_grad():
|
|
||||||
encoded_layers, _ = model(tokens_tensor, segments_tensors)
|
|
||||||
"""
|
|
||||||
model = BertModel.from_pretrained(*args, **kwargs)
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(bert_docstring)
|
|
||||||
def bertForNextSentencePrediction(*args, **kwargs):
|
|
||||||
"""
|
|
||||||
BERT model with next sentence prediction head.
|
|
||||||
This module comprises the BERT model followed by the next sentence
|
|
||||||
classification head.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
# Load the tokenizer
|
|
||||||
import torch
|
|
||||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
|
|
||||||
# Prepare tokenized input
|
|
||||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
|
||||||
tokenized_text = tokenizer.tokenize(text)
|
|
||||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
|
||||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
|
|
||||||
tokens_tensor = torch.tensor([indexed_tokens])
|
|
||||||
segments_tensors = torch.tensor([segments_ids])
|
|
||||||
# Load bertForNextSentencePrediction
|
|
||||||
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
|
|
||||||
model.eval()
|
|
||||||
# Predict the next sentence classification logits
|
|
||||||
with torch.no_grad():
|
|
||||||
next_sent_classif_logits = model(tokens_tensor, segments_tensors)
|
|
||||||
"""
|
|
||||||
model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(bert_docstring)
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.
    All arguments are forwarded to `BertForPreTraining.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForPreTraining
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
        masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(bert_docstring)
def bertForMaskedLM(*args, **kwargs):
    """
    BertModel Transformer followed by the (possibly) pre-trained masked
    language modeling head.
    All arguments are forwarded to `BertForMaskedLM.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        masked_index = 8
        tokenized_text[masked_index] = '[MASK]'
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
        model.eval()
        # Predict all tokens
        with torch.no_grad():
            predictions = model(tokens_tensor, segments_tensors)
        predicted_index = torch.argmax(predictions[0, masked_index]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        'henson'
    """
    return BertForMaskedLM.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(bert_docstring)
def bertForSequenceClassification(*args, **kwargs):
    """
    Fine-tuning model made of BertModel plus a sequence-level (sequence or
    pair of sequences) classifier on top of it. The classification head is
    only initialized and has to be trained.

    The sequence-level classifier is a linear layer that takes as input the
    last hidden state of the first character in the input sequence
    (see Figures 3a and 3b in the BERT paper).

    All arguments are forwarded to
    `BertForSequenceClassification.from_pretrained`.

    Args:
    num_labels: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForSequenceClassification
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
        model.eval()
        # Predict the sequence classification logits
        with torch.no_grad():
            seq_classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the sequence classification loss
        labels = torch.tensor([1])
        seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    return BertForSequenceClassification.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(bert_docstring)
def bertForMultipleChoice(*args, **kwargs):
    """
    Fine-tuning model made of BertModel plus a linear layer on top of it.
    The multiple choice head is only initialized and has to be trained.
    All arguments are forwarded to `BertForMultipleChoice.from_pretrained`.

    Args:
    num_choices: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
        segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
        # Load bertForMultipleChoice
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
        model.eval()
        # Predict the multiple choice logits
        with torch.no_grad():
            multiple_choice_logits = model(tokens_tensor, segments_tensors)
        # Or get the multiple choice loss
        labels = torch.tensor([1])
        multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    return BertForMultipleChoice.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(bert_docstring)
def bertForQuestionAnswering(*args, **kwargs):
    """
    BertForQuestionAnswering is a fine-tuning model that includes BertModel
    with token-level classifiers on top of the full sequence of last hidden
    states. Note that the classification head is only initialized
    and has to be trained.
    All arguments are forwarded to `BertForQuestionAnswering.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForQuestionAnswering
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
        model.eval()
        # Predict the start and end positions logits
        with torch.no_grad():
            start_logits, end_logits = model(tokens_tensor, segments_tensors)
        # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
        start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
        # set model.train() before if training this loss
        total_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
    """
    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(bert_docstring)
def bertForTokenClassification(*args, **kwargs):
    """
    Fine-tuning model made of BertModel plus a token-level classifier on top
    of it. The classification head is only initialized and has to be trained.

    The token-level classifier is a linear layer that takes as input the last
    hidden state of the sequence.

    All arguments are forwarded to
    `BertForTokenClassification.from_pretrained`.

    Args:
    num_labels: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForTokenClassification
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
        model.eval()
        # Predict the token classification logits
        with torch.no_grad():
            classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the token classification loss
        labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
        classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    return BertForTokenClassification.from_pretrained(*args, **kwargs)
|
|
||||||
@@ -1,168 +0,0 @@
|
|||||||
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
|
|
||||||
from pytorch_transformers.modeling_gpt2 import (
|
|
||||||
GPT2Model,
|
|
||||||
GPT2LMHeadModel,
|
|
||||||
GPT2DoubleHeadsModel
|
|
||||||
)
|
|
||||||
|
|
||||||
# A lot of models share the same param doc. Use a decorator
|
|
||||||
# to save typing
|
|
||||||
# Shared parameter documentation appended to the GPT-2 model entry points.
gpt2_docstring = """
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load selected in the list of:
                . `gpt2`, `gpt2-medium`
            - a path or url to a pretrained model archive containing:
                . `gpt2_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
            - a path or url to a pretrained model archive containing:
                . `gpt2_config.json` a configuration file for the model
                . a TensorFlow checkpoint with trained weights
        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
        *inputs, **kwargs: additional input for the specific GPT-2 class
"""
|
|
||||||
|
|
||||||
|
|
||||||
def _append_from_pretrained_docstring(docstr):
|
|
||||||
def docstring_decorator(fn):
|
|
||||||
fn.__doc__ = fn.__doc__ + docstr
|
|
||||||
return fn
|
|
||||||
return docstring_decorator
|
|
||||||
|
|
||||||
|
|
||||||
def gpt2Tokenizer(*args, **kwargs):
    """
    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
    Peculiarities:
        - Byte-level BPE
    All arguments are forwarded to `GPT2Tokenizer.from_pretrained`.

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * gpt2
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None

    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        text = "Who was Jim Henson ?"
        indexed_tokens = tokenizer.encode(text)
    """
    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2Model(*args, **kwargs):
    """
    Basic OpenAI GPT-2 Transformer model: identical stacked masked
    self-attention blocks, pre-trained on large scale dataset using
    language modeling signal.
    All arguments are forwarded to `GPT2Model.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        # Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2Model
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
        model.eval()

        # Predict hidden states features for each layer
        # past can be used to reuse precomputed hidden state in a subsequent predictions
        with torch.no_grad():
            hidden_states_1, past = model(tokens_tensor_1)
            hidden_states_2, past = model(tokens_tensor_2, past=past)
    """
    return GPT2Model.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2LMHeadModel(*args, **kwargs):
    """
    OpenAI GPT-2 Transformer model with the tied (pre-trained) language
    modeling head on top.
    All arguments are forwarded to `GPT2LMHeadModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        # Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2LMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
        model.eval()

        # Predict hidden states features for each layer
        # past can be used to reuse precomputed hidden state in a subsequent predictions
        with torch.no_grad():
            predictions_1, past = model(tokens_tensor_1)
            predictions_2, past = model(tokens_tensor_2, past=past)

        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.decode([predicted_index])
        assert predicted_token == ' who'
    """
    return GPT2LMHeadModel.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2DoubleHeadsModel(*args, **kwargs):
    """
    OpenAI GPT-2 Transformer model with the tied (pre-trained) language
    modeling head and a multiple choice classification head (only
    initialized, not pre-trained).
    All arguments are forwarded to `GPT2DoubleHeadsModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        # Prepare tokenized input
        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
        tokenized_text1 = tokenizer.tokenize(text1)
        tokenized_text2 = tokenizer.tokenize(text2)
        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])

        # Load gpt2DoubleHeadsModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
        model.eval()

        # Predict hidden states features for each layer
        with torch.no_grad():
            lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
    """
    return GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
|
|
||||||
@@ -1,186 +0,0 @@
|
|||||||
from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
|
|
||||||
from pytorch_transformers.modeling_openai import (
|
|
||||||
OpenAIGPTModel,
|
|
||||||
OpenAIGPTLMHeadModel,
|
|
||||||
OpenAIGPTDoubleHeadsModel
|
|
||||||
)
|
|
||||||
|
|
||||||
# Dependencies that are not specified in global hubconf.py
|
|
||||||
specific_dependencies = ['spacy', 'ftfy']
|
|
||||||
|
|
||||||
# A lot of models share the same param doc. Use a decorator
|
|
||||||
# to save typing
|
|
||||||
# Shared documentation appended to the OpenAI GPT model entry points.
gpt_docstring = """
    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
    Special tokens need to be trained during the fine-tuning if you use them.
    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.

    The embeddings are ordered as follows in the token embeddings matrix:
        [0,                                                         ----------------------
         ...                                                        -> word embeddings
         config.vocab_size - 1,                                     ______________________
         config.vocab_size,
         ...                                                        -> special embeddings
         config.vocab_size + config.n_special - 1]                  ______________________

    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
        total_tokens_embeddings = config.vocab_size + config.n_special
    You should use the associate indices to index the embeddings.

    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load selected in the list of:
                . `openai-gpt`
            - a path or url to a pretrained model archive containing:
                . `openai_gpt_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
            - a path or url to a pretrained model archive containing:
                . `openai-gpt-config.json` a configuration file for the model
                . a series of NumPy files containing OpenAI TensorFlow trained weights
        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object)
                    to use instead of pre-trained models
        *inputs, **kwargs: additional input for the specific OpenAI-GPT class
"""
|
|
||||||
|
|
||||||
|
|
||||||
def _append_from_pretrained_docstring(docstr):
|
|
||||||
def docstring_decorator(fn):
|
|
||||||
fn.__doc__ = fn.__doc__ + docstr
|
|
||||||
return fn
|
|
||||||
return docstring_decorator
|
|
||||||
|
|
||||||
|
|
||||||
def openAIGPTTokenizer(*args, **kwargs):
    """
    Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
    Peculiarities:
        - lower case all inputs
        - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
        - argument special_tokens and function set_special_tokens:
            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
    All arguments are forwarded to `OpenAIGPTTokenizer.from_pretrained`.

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * openai-gpt
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None

    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
    """
    tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTModel(*args, **kwargs):
    """
    Basic OpenAI GPT Transformer model: identical stacked masked
    self-attention blocks, pre-trained on large scale dataset using
    language modeling signal.
    All arguments are forwarded to `OpenAIGPTModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        # Prepare tokenized input
        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])

        # Load openAIGPTModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
        model.eval()

        # Predict hidden states features for each layer
        with torch.no_grad():
            hidden_states = model(tokens_tensor)
    """
    return OpenAIGPTModel.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTLMHeadModel(*args, **kwargs):
    """
    OpenAI GPT Transformer model with the tied (pre-trained) language
    modeling head on top.
    All arguments are forwarded to `OpenAIGPTLMHeadModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        # Prepare tokenized input
        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])

        # Load openAIGPTLMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
        model.eval()

        # Predict hidden states features for each layer
        with torch.no_grad():
            predictions = model(tokens_tensor)

        # Get the predicted last token
        predicted_index = torch.argmax(predictions[0, -1, :]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        '.</w>'
    """
    return OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTDoubleHeadsModel(*args, **kwargs):
    """
    OpenAI GPT Transformer model with the tied (pre-trained) language
    modeling head and a multiple choice classification head (only
    initialized, not pre-trained).
    All arguments are forwarded to
    `OpenAIGPTDoubleHeadsModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        # Prepare tokenized input
        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
        tokenized_text1 = tokenizer.tokenize(text1)
        tokenized_text2 = tokenizer.tokenize(text2)
        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])

        # Load openAIGPTDoubleHeadsModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
        model.eval()

        # Predict hidden states features for each layer
        with torch.no_grad():
            lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
    """
    return OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
|
|
||||||
@@ -1,130 +0,0 @@
|
|||||||
from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
|
|
||||||
from pytorch_transformers.modeling_transfo_xl import (
|
|
||||||
TransfoXLModel,
|
|
||||||
TransfoXLLMHeadModel
|
|
||||||
)
|
|
||||||
|
|
||||||
# A lot of models share the same param doc. Use a decorator
|
|
||||||
# to save typing
|
|
||||||
# Shared documentation appended to the Transformer-XL model entry points.
transformer_xl_docstring = """
    Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
    - you don't need to specify positioning embeddings indices
    - the tokens in the vocabulary have to be sorted in decreasing frequency.

    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load selected in the list of:
                . `transfo-xl-wt103`
            - a path or url to a pretrained model archive containing:
                . `transfo_xl_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
            - a path or url to a pretrained model archive containing:
                . `transfo_xl_config.json` a configuration file for the model
                . `model.chkpt` a TensorFlow checkpoint
        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
        *inputs, **kwargs: additional input for the specific TransformerXL class
"""
|
|
||||||
|
|
||||||
|
|
||||||
def _append_from_pretrained_docstring(docstr):
|
|
||||||
def docstring_decorator(fn):
|
|
||||||
fn.__doc__ = fn.__doc__ + docstr
|
|
||||||
return fn
|
|
||||||
return docstring_decorator
|
|
||||||
|
|
||||||
|
|
||||||
def transformerXLTokenizer(*args, **kwargs):
    """
    Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
    All arguments are forwarded to `TransfoXLTokenizer.from_pretrained`.

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * transfo-xl-wt103

    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')

        text = "Who was Jim Henson ?"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    """
    tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(transformer_xl_docstring)
def transformerXLModel(*args, **kwargs):
    """
    Basic Transformer XL model.
    All arguments are forwarded to `TransfoXLModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')

        # Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        tokenized_text_1 = tokenizer.tokenize(text_1)
        tokenized_text_2 = tokenizer.tokenize(text_2)
        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load transformerXLModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
        model.eval()

        # Predict hidden states features for each layer
        # We can re-use the memory cells in a subsequent call to attend a longer context
        with torch.no_grad():
            hidden_states_1, mems_1 = model(tokens_tensor_1)
            hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
    """
    return TransfoXLModel.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(transformer_xl_docstring)
def transformerXLLMHeadModel(*args, **kwargs):
    """
    transformerXLLMHeadModel is the Transformer XL model with the
    tied (pre-trained) language modeling head on top.
    All arguments are forwarded to `TransfoXLLMHeadModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')

        # Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        tokenized_text_1 = tokenizer.tokenize(text_1)
        tokenized_text_2 = tokenizer.tokenize(text_2)
        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load transformerXLLMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
        model.eval()

        # Predict hidden states features for each layer
        # We can re-use the memory cells in a subsequent call to attend a longer context
        with torch.no_grad():
            predictions_1, mems_1 = model(tokens_tensor_1)
            predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)

        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        assert predicted_token == 'who'
    """
    model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
    return model
|
|
||||||
@@ -1,167 +0,0 @@
|
|||||||
from pytorch_transformers.tokenization_xlm import XLMTokenizer
|
|
||||||
from pytorch_transformers.modeling_xlm import (
|
|
||||||
XLMConfig,
|
|
||||||
XLMModel,
|
|
||||||
XLMWithLMHeadModel,
|
|
||||||
XLMForSequenceClassification,
|
|
||||||
XLMForQuestionAnswering
|
|
||||||
)
|
|
||||||
|
|
||||||
# A lot of models share the same param doc. Use a decorator
|
|
||||||
# to save typing
|
|
||||||
xlm_start_docstring = """
|
|
||||||
Model class adapted from the XLM Transformer model of
|
|
||||||
"Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
|
|
||||||
Paper: https://arxiv.org/abs/1901.07291
|
|
||||||
Original code: https://github.com/facebookresearch/XLM
|
|
||||||
|
|
||||||
Example:
|
|
||||||
# Load the tokenizer
|
|
||||||
import torch
|
|
||||||
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
|
|
||||||
|
|
||||||
# Prepare tokenized input
|
|
||||||
text_1 = "Who was Jim Henson ?"
|
|
||||||
text_2 = "Jim Henson was a puppeteer"
|
|
||||||
indexed_tokens_1 = tokenizer.encode(text_1)
|
|
||||||
indexed_tokens_2 = tokenizer.encode(text_2)
|
|
||||||
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
|
||||||
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
|
||||||
"""
|
|
||||||
|
|
||||||
# A lot of models share the same param doc. Use a decorator
|
|
||||||
# to save typing
|
|
||||||
xlm_end_docstring = """
|
|
||||||
Params:
|
|
||||||
pretrained_model_name_or_path: either:
|
|
||||||
- a str with the name of a pre-trained model to load selected in the list of:
|
|
||||||
. `xlm-mlm-en-2048`
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `config.json` a configuration file for the model
|
|
||||||
. `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
|
|
||||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
|
||||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
|
||||||
*inputs, **kwargs: additional input for the specific XLM class
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def _begin_with_docstring(docstr):
    """Build a decorator that puts `docstr` in front of a function's docstring.

    Args:
        docstr: text to insert before the decorated function's own docstring.

    Returns:
        A decorator that rewrites `fn.__doc__` in place and returns `fn` itself.
    """
    def docstring_decorator(fn):
        # Prepend (not append) so the shared text really begins the docstring.
        # A missing docstring (None) is treated as empty instead of raising TypeError.
        fn.__doc__ = docstr + (fn.__doc__ or "")
        return fn
    return docstring_decorator
|
|
||||||
|
|
||||||
def _end_with_docstring(docstr):
    """Build a decorator that appends `docstr` to a function's docstring.

    Args:
        docstr: text to add after the decorated function's own docstring.

    Returns:
        A decorator that rewrites `fn.__doc__` in place and returns `fn` itself.
    """
    def docstring_decorator(fn):
        # A missing docstring (None) is treated as empty instead of raising TypeError.
        fn.__doc__ = (fn.__doc__ or "") + docstr
        return fn
    return docstring_decorator
|
|
||||||
|
|
||||||
|
|
||||||
def xlmTokenizer(*args, **kwargs):
    """
    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.

    All positional and keyword arguments are forwarded unchanged to
    `XLMTokenizer.from_pretrained`.

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * xlm-mlm-en-2048
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None

    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')

        text = "Who was Jim Henson ?"
        indexed_tokens = tokenizer.encode(text)
    """
    return XLMTokenizer.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_begin_with_docstring(xlm_start_docstring)
@_end_with_docstring(xlm_end_docstring)
def xlmModel(*args, **kwargs):
    """
        # Load xlmModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
        model.eval()

        # Predict hidden states features for each layer
        with torch.no_grad():
            hidden_states_1, mems = model(tokens_tensor_1)
            hidden_states_2, mems = model(tokens_tensor_2, past=mems)
    """
    # The docstring above is only the model-specific part of the example; the
    # decorators splice the shared XLM intro and parameter texts onto it.
    # NOTE(review): the example unpacks two outputs and passes `past=`; confirm
    # both against XLMModel.forward, which is not visible from this file.
    # Arguments are forwarded untouched to the pretrained loader.
    return XLMModel.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_begin_with_docstring(xlm_start_docstring)
@_end_with_docstring(xlm_end_docstring)
def xlmLMHeadModel(*args, **kwargs):
    """
        # Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load xlmLMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'xlmLMHeadModel', 'xlm-mlm-en-2048')
        model.eval()

        # Predict the next-token scores
        with torch.no_grad():
            predictions_1, mems = model(tokens_tensor_1)
            predictions_2, mems = model(tokens_tensor_2, mems=mems)

        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.decode([predicted_index])
        assert predicted_token == ' who'
    """
    # The docstring above is only the model-specific part of the example; the
    # decorators splice the shared XLM intro and parameter texts onto it.
    # NOTE(review): the example unpacks two outputs and passes `mems=`; confirm
    # both against XLMWithLMHeadModel.forward, which is not visible from this file.
    return XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
# @_end_with_docstring(xlnet_docstring)
|
|
||||||
# def xlnetForSequenceClassification(*args, **kwargs):
|
|
||||||
# """
|
|
||||||
# xlnetModel is the basic XLNet Transformer model from
|
|
||||||
# "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
|
||||||
# by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
|
||||||
|
|
||||||
# Example:
|
|
||||||
# # Load the tokenizer
|
|
||||||
# import torch
|
|
||||||
# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
|
|
||||||
|
|
||||||
# # Prepare tokenized input
|
|
||||||
# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
|
||||||
# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
|
|
||||||
# tokenized_text1 = tokenizer.tokenize(text1)
|
|
||||||
# tokenized_text2 = tokenizer.tokenize(text2)
|
|
||||||
# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
|
|
||||||
# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
|
|
||||||
# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
|
|
||||||
# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
|
|
||||||
|
|
||||||
# # Load xlnetForSequenceClassification
|
|
||||||
# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
|
|
||||||
# model.eval()
|
|
||||||
|
|
||||||
# # Predict sequence classes logits
|
|
||||||
# with torch.no_grad():
|
|
||||||
# lm_logits, mems = model(tokens_tensor)
|
|
||||||
# """
|
|
||||||
# model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
|
|
||||||
# return model
|
|
||||||
@@ -1,169 +0,0 @@
|
|||||||
from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
|
|
||||||
from pytorch_transformers.modeling_xlnet import (
|
|
||||||
XLNetConfig,
|
|
||||||
XLNetModel,
|
|
||||||
XLNetLMHeadModel,
|
|
||||||
# XLNetForSequenceClassification
|
|
||||||
)
|
|
||||||
|
|
||||||
# A lot of models share the same param doc. Use a decorator
|
|
||||||
# to save typing
|
|
||||||
xlnet_docstring = """
|
|
||||||
Params:
|
|
||||||
pretrained_model_name_or_path: either:
|
|
||||||
- a str with the name of a pre-trained model to load selected in the list of:
|
|
||||||
. `xlnet-large-cased`
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `config.json` a configuration file for the model
|
|
||||||
. `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `xlnet_config.json` a configuration file for the model
|
|
||||||
. `model.chkpt` a TensorFlow checkpoint
|
|
||||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
|
||||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
|
||||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
|
||||||
*inputs, **kwargs: additional input for the specific XLNet class
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def _append_from_pretrained_docstring(docstr):
    """Build a decorator that appends `docstr` to a function's docstring.

    Used to share one parameter description between several entry points.

    Args:
        docstr: text to add after the decorated function's own docstring.

    Returns:
        A decorator that rewrites `fn.__doc__` in place and returns `fn` itself.
    """
    def docstring_decorator(fn):
        # A missing docstring (None) is treated as empty instead of raising TypeError.
        fn.__doc__ = (fn.__doc__ or "") + docstr
        return fn
    return docstring_decorator
|
|
||||||
|
|
||||||
|
|
||||||
def xlnetTokenizer(*args, **kwargs):
    """
    Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
    Peculiarities:
        - require Google sentencepiece (https://github.com/google/sentencepiece)

    All positional and keyword arguments are forwarded unchanged to
    `XLNetTokenizer.from_pretrained`.

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * xlnet-large-cased
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None

    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')

        text = "Who was Jim Henson ?"
        indexed_tokens = tokenizer.encode(text)
    """
    return XLNetTokenizer.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(xlnet_docstring)
def xlnetModel(*args, **kwargs):
    """
    xlnetModel is the basic XLNet Transformer model from
        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le

    All positional and keyword arguments are forwarded unchanged to
    `XLNetModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')

        # Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load xlnetModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
        model.eval()

        # Predict hidden states features for each layer
        with torch.no_grad():
            hidden_states_1, mems = model(tokens_tensor_1)
            hidden_states_2, mems = model(tokens_tensor_2, mems=mems)
    """
    return XLNetModel.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@_append_from_pretrained_docstring(xlnet_docstring)
def xlnetLMHeadModel(*args, **kwargs):
    """
    xlnetLMHeadModel is the XLNet Transformer model from
        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
        with a tied (pre-trained) language modeling head on top.

    All positional and keyword arguments are forwarded unchanged to
    `XLNetLMHeadModel.from_pretrained`.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')

        # Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load xlnetLMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
        model.eval()

        # Predict the next-token scores
        with torch.no_grad():
            predictions_1, mems = model(tokens_tensor_1)
            predictions_2, mems = model(tokens_tensor_2, mems=mems)

        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.decode([predicted_index])
        assert predicted_token == ' who'
    """
    return XLNetLMHeadModel.from_pretrained(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
# @_append_from_pretrained_docstring(xlnet_docstring)
|
|
||||||
# def xlnetForSequenceClassification(*args, **kwargs):
|
|
||||||
# """
|
|
||||||
# xlnetModel is the basic XLNet Transformer model from
|
|
||||||
# "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
|
||||||
# by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
|
||||||
|
|
||||||
# Example:
|
|
||||||
# # Load the tokenizer
|
|
||||||
# import torch
|
|
||||||
# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
|
|
||||||
|
|
||||||
# # Prepare tokenized input
|
|
||||||
# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
|
||||||
# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
|
|
||||||
# tokenized_text1 = tokenizer.tokenize(text1)
|
|
||||||
# tokenized_text2 = tokenizer.tokenize(text2)
|
|
||||||
# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
|
|
||||||
# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
|
|
||||||
# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
|
|
||||||
# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
|
|
||||||
|
|
||||||
# # Load xlnetForSequenceClassification
|
|
||||||
# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
|
|
||||||
# model.eval()
|
|
||||||
|
|
||||||
# # Predict sequence classes logits
|
|
||||||
# with torch.no_grad():
|
|
||||||
# lm_logits, mems = model(tokens_tensor)
|
|
||||||
# """
|
|
||||||
# model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
|
|
||||||
# return model
|
|
||||||
@@ -1,4 +1,18 @@
|
|||||||
__version__ = "1.1.0"
|
__version__ = "1.2.0"
|
||||||
|
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||||
|
# default Python logging output behavior when present.
|
||||||
|
# see: https://github.com/abseil/abseil-py/issues/99
|
||||||
|
# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
|
||||||
|
try:
    import absl.logging
    absl.logging.set_verbosity('info')
    absl.logging.set_stderrthreshold('info')
    absl.logging._warn_preinit_stderr = False
except Exception:
    # Best-effort workaround: any failure here (e.g. absl cannot be imported)
    # is ignored on purpose. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit raised during import still propagate.
    pass
|
||||||
|
|
||||||
|
# Tokenizer
|
||||||
|
from .tokenization_utils import (PreTrainedTokenizer)
|
||||||
from .tokenization_auto import AutoTokenizer
|
from .tokenization_auto import AutoTokenizer
|
||||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||||
from .tokenization_openai import OpenAIGPTTokenizer
|
from .tokenization_openai import OpenAIGPTTokenizer
|
||||||
@@ -9,46 +23,53 @@ from .tokenization_xlm import XLMTokenizer
|
|||||||
from .tokenization_roberta import RobertaTokenizer
|
from .tokenization_roberta import RobertaTokenizer
|
||||||
from .tokenization_distilbert import DistilBertTokenizer
|
from .tokenization_distilbert import DistilBertTokenizer
|
||||||
|
|
||||||
from .tokenization_utils import (PreTrainedTokenizer)
|
# Configurations
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
from .configuration_auto import AutoConfig
|
||||||
|
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_auto import (AutoConfig, AutoModel)
|
# Modeling
|
||||||
|
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
|
||||||
|
from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
|
||||||
|
AutoModelWithLMHead)
|
||||||
|
|
||||||
from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
|
from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
|
||||||
BertForMaskedLM, BertForNextSentencePrediction,
|
BertForMaskedLM, BertForNextSentencePrediction,
|
||||||
BertForSequenceClassification, BertForMultipleChoice,
|
BertForSequenceClassification, BertForMultipleChoice,
|
||||||
BertForTokenClassification, BertForQuestionAnswering,
|
BertForTokenClassification, BertForQuestionAnswering,
|
||||||
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
|
||||||
from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel,
|
|
||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
||||||
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
||||||
from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
|
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
|
||||||
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model,
|
|
||||||
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
||||||
load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
||||||
from .modeling_xlnet import (XLNetConfig,
|
|
||||||
XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
|
||||||
XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNetForMultipleChoice,
|
XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNetForMultipleChoice,
|
||||||
load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
||||||
from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
|
|
||||||
XLMWithLMHeadModel, XLMForSequenceClassification,
|
XLMWithLMHeadModel, XLMForSequenceClassification,
|
||||||
XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
|
||||||
from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
|
RobertaForMultipleChoice, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
RobertaForMultipleChoice,
|
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
||||||
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
|
||||||
from .modeling_distilbert import (DistilBertConfig, DistilBertForMaskedLM, DistilBertModel,
|
|
||||||
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
||||||
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
|
|
||||||
PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
|
|
||||||
|
|
||||||
|
# Optimization
|
||||||
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
||||||
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
||||||
|
|
||||||
from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
|
# Files and general utilities
|
||||||
|
from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
|
||||||
|
cached_path, add_start_docstrings, add_end_docstrings,
|
||||||
|
WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME)
|
||||||
|
|||||||
135
pytorch_transformers/configuration_auto.py
Normal file
135
pytorch_transformers/configuration_auto.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Auto Config class. """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from .configuration_bert import BertConfig
|
||||||
|
from .configuration_openai import OpenAIGPTConfig
|
||||||
|
from .configuration_gpt2 import GPT2Config
|
||||||
|
from .configuration_transfo_xl import TransfoXLConfig
|
||||||
|
from .configuration_xlnet import XLNetConfig
|
||||||
|
from .configuration_xlm import XLMConfig
|
||||||
|
from .configuration_roberta import RobertaConfig
|
||||||
|
from .configuration_distilbert import DistilBertConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class AutoConfig(object):
    r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
        that will be instantiated as one of the configuration classes of the library
        when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct configuration class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The configuration class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertConfig (DistilBERT model)
            - contains `roberta`: RobertaConfig (RoBERTa model)
            - contains `bert`: BertConfig (Bert model)
            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
            - contains `xlnet`: XLNetConfig (XLNet model)
            - contains `xlm`: XLMConfig (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("AutoConfig is designed to be instantiated "
            "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r""" Instantiate one of the configuration classes of the library
        from a pre-trained model configuration.

        The configuration class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertConfig (DistilBERT model)
            - contains `roberta`: RobertaConfig (RoBERTa model)
            - contains `bert`: BertConfig (Bert model)
            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
            - contains `xlnet`: XLNetConfig (XLNet model)
            - contains `xlm`: XLMConfig (XLM model)

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.

                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            return_unused_kwargs: (`optional`) bool:

                - If False, then this function returns just the final configuration object.
                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.

        Raises:
            ValueError: if `pretrained_model_name_or_path` contains none of the known patterns.

        Examples::

            config = AutoConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
            config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
            config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
            config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
            assert config.output_attention == True
            config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
                                                               foo=False, return_unused_kwargs=True)
            assert config.output_attention == True
            assert unused_kwargs == {'foo': False}

        """
        # 'distilbert' and 'roberta' both contain the substring 'bert', so they
        # must be tested before the plain 'bert' pattern.
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        # The message lists every supported pattern, including 'distilbert'.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta', 'distilbert'".format(pretrained_model_name_or_path))
|
||||||
113
pytorch_transformers/configuration_bert.py
Normal file
113
pytorch_transformers/configuration_bert.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" BERT model configuration """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each BERT shortcut name to the URL of its hosted JSON configuration.
# Every file is published as "<bucket>/<shortcut name>-config.json", so the URLs are
# derived from the shortcut names instead of being spelled out one by one.
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    shortcut: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json".format(shortcut)
    for shortcut in (
        'bert-base-uncased',
        'bert-large-uncased',
        'bert-base-cased',
        'bert-large-cased',
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-chinese',
        'bert-base-german-cased',
        'bert-large-uncased-whole-word-masking',
        'bert-large-cased-whole-word-masking',
        'bert-large-uncased-whole-word-masking-finetuned-squad',
        'bert-large-cased-whole-word-masking-finetuned-squad',
        'bert-base-cased-finetuned-mrpc',
    )
}
|
||||||
|
|
||||||
|
|
||||||
|
class BertConfig(PretrainedConfig):
    r"""
        :class:`~pytorch_transformers.BertConfig` holds the hyper-parameters of a `BertModel`.

        The first argument is overloaded. Given an ``int`` it is the vocabulary size, and
        every other argument listed below is stored as an attribute of the same name.
        Given a string it is the path of a JSON file whose top-level keys become the
        attributes; the other arguments listed below are then ignored.

        Arguments:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`,
                or the path to a JSON configuration file.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The standard deviation of the truncated_normal_initializer
                for initializing all weight matrices.
            layer_norm_eps: The epsilon used by LayerNorm.
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

        Raises:
            ValueError: If the first argument is neither an ``int`` nor a string.
    """
    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 **kwargs):
        super(BertConfig, self).__init__(**kwargs)

        source = vocab_size_or_config_json_file

        # `unicode` only exists on Python 2; the version test short-circuits before the
        # name is ever looked up on Python 3.
        if isinstance(source, str) or (sys.version_info[0] == 2 and isinstance(source, unicode)):
            # Path to a JSON file: its keys are written straight into the instance dict.
            with open(source, "r", encoding='utf-8') as reader:
                self.__dict__.update(json.load(reader))
            return

        if not isinstance(source, int):
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")

        self.vocab_size = source
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
|
||||||
89
pytorch_transformers/configuration_distilbert.py
Normal file
89
pytorch_transformers/configuration_distilbert.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" DistilBERT model configuration """
|
||||||
|
from __future__ import (absolute_import, division, print_function,
|
||||||
|
unicode_literals)
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each DistilBERT shortcut name to the URL of its hosted JSON configuration.
# Every file is published as "<bucket>/<shortcut name>-config.json".
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    shortcut: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json".format(shortcut)
    for shortcut in (
        'distilbert-base-uncased',
        'distilbert-base-uncased-distilled-squad',
    )
}
|
||||||
|
|
||||||
|
|
||||||
|
class DistilBertConfig(PretrainedConfig):
    """Configuration class storing the hyper-parameters of a DistilBERT model.

    The first argument is overloaded. Given an ``int`` it is the vocabulary size, and
    every other argument listed below is stored as an attribute of the same name.
    Given a string it is the path of a JSON file whose top-level keys become the
    attributes; the other arguments listed below are then ignored.

    Args:
        vocab_size_or_config_json_file: Vocabulary size (int), or the path to a JSON
            configuration file (str).
        n_layers: Also readable through the ``num_hidden_layers`` property.
        n_heads: Also readable through the ``num_attention_heads`` property.
        dim: Also readable through the ``hidden_size`` property.
        max_position_embeddings, sinusoidal_pos_embds, hidden_dim, dropout,
        attention_dropout, activation, initializer_range, tie_weights_, qa_dropout,
        seq_classif_dropout: Stored unchanged; how they are interpreted is decided by
            the model code, which is not part of this module.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If the first argument is neither an ``int`` nor a string.
    """
    pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 max_position_embeddings=512,
                 sinusoidal_pos_embds=True,
                 n_layers=6,
                 n_heads=12,
                 dim=768,
                 hidden_dim=4*768,
                 dropout=0.1,
                 attention_dropout=0.1,
                 activation='gelu',
                 initializer_range=0.02,
                 tie_weights_=True,
                 qa_dropout=0.1,
                 seq_classif_dropout=0.2,
                 **kwargs):
        super(DistilBertConfig, self).__init__(**kwargs)

        source = vocab_size_or_config_json_file

        # `unicode` only exists on Python 2; the version test short-circuits before the
        # name is ever looked up on Python 3.
        if isinstance(source, str) or (sys.version_info[0] == 2 and isinstance(source, unicode)):
            # Path to a JSON file: its keys are written straight into the instance dict.
            with open(source, "r", encoding='utf-8') as reader:
                self.__dict__.update(json.load(reader))
            return

        if not isinstance(source, int):
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")

        self.vocab_size = source
        self.max_position_embeddings = max_position_embeddings
        self.sinusoidal_pos_embds = sinusoidal_pos_embds
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dim = dim
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation = activation
        self.initializer_range = initializer_range
        self.tie_weights_ = tie_weights_
        self.qa_dropout = qa_dropout
        self.seq_classif_dropout = seq_classif_dropout

    @property
    def hidden_size(self):
        """Read-only alias for ``dim``."""
        return self.dim

    @property
    def num_attention_heads(self):
        """Read-only alias for ``n_heads``."""
        return self.n_heads

    @property
    def num_hidden_layers(self):
        """Read-only alias for ``n_layers``."""
        return self.n_layers
|
||||||
143
pytorch_transformers/configuration_gpt2.py
Normal file
143
pytorch_transformers/configuration_gpt2.py
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" OpenAI GPT-2 configuration """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each GPT-2 shortcut name to the URL of its hosted JSON configuration.
# Every file is published as "<bucket>/<shortcut name>-config.json".
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    shortcut: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json".format(shortcut)
    for shortcut in (
        "gpt2",
        "gpt2-medium",
        "gpt2-large",
    )
}
|
||||||
|
|
||||||
|
class GPT2Config(PretrainedConfig):
    """Configuration class to store the configuration of a `GPT2Model`.

    The first argument is overloaded. Given an ``int`` it is the vocabulary size, and
    every other argument listed below is stored as an attribute of the same name.
    Given a string it is the path of a JSON file whose top-level keys become the
    attributes; the other arguments listed below are then ignored.

    Args:
        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        n_embd: Dimensionality of the embeddings and hidden states.
        n_layer: Number of hidden layers in the Transformer encoder.
        n_head: Number of attention heads for each attention layer in
            the Transformer encoder.
        resid_pdrop: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        embd_pdrop: The dropout ratio for the embeddings.
        attn_pdrop: The dropout ratio for the attention
            probabilities.
        layer_norm_epsilon: epsilon to use in the layer norm layers
        initializer_range: The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        num_labels, summary_type, summary_use_proj, summary_activation,
        summary_proj_to_labels, summary_first_dropout: Stored unchanged. Presumably
            read by a sequence-summary / classification head in the model code, which
            is not part of this module -- confirm there.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If the first argument is neither an ``int`` nor a string.
    """
    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(
        self,
        vocab_size_or_config_json_file=50257,
        n_positions=1024,
        n_ctx=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,

        num_labels=1,
        summary_type='cls_index',
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        **kwargs
    ):
        """Constructs GPT2Config; the arguments are described in the class docstring."""
        super(GPT2Config, self).__init__(**kwargs)

        # `unicode` only exists on Python 2; the version test short-circuits before the
        # name is ever looked up on Python 3.
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):
            # Path to a JSON file: its keys are written straight into the instance dict.
            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.n_ctx = n_ctx
            self.n_positions = n_positions
            self.n_embd = n_embd
            self.n_layer = n_layer
            self.n_head = n_head
            self.resid_pdrop = resid_pdrop
            self.embd_pdrop = embd_pdrop
            self.attn_pdrop = attn_pdrop
            self.layer_norm_epsilon = layer_norm_epsilon
            self.initializer_range = initializer_range

            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_first_dropout = summary_first_dropout
            self.summary_proj_to_labels = summary_proj_to_labels
        else:
            # The leading space on the second literal matters: adjacent string literals
            # are joined with nothing in between.
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                " or the path to a pretrained model config file (str)"
            )

    @property
    def max_position_embeddings(self):
        """Read-only alias for ``n_positions``."""
        return self.n_positions

    @property
    def hidden_size(self):
        """Read-only alias for ``n_embd``."""
        return self.n_embd

    @property
    def num_attention_heads(self):
        """Read-only alias for ``n_head``."""
        return self.n_head

    @property
    def num_hidden_layers(self):
        """Read-only alias for ``n_layer``."""
        return self.n_layer
|
||||||
135
pytorch_transformers/configuration_openai.py
Normal file
135
pytorch_transformers/configuration_openai.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" OpenAI GPT configuration """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each OpenAI GPT shortcut name to the URL of its hosted JSON configuration.
# Every file is published as "<bucket>/<shortcut name>-config.json".
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    shortcut: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json".format(shortcut)
    for shortcut in (
        "openai-gpt",
    )
}
|
||||||
|
|
||||||
|
class OpenAIGPTConfig(PretrainedConfig):
    """
    Configuration class to store the configuration of a `OpenAIGPTModel`.

    The first argument is overloaded. Given an ``int`` it is the vocabulary size, and
    every other argument listed below is stored as an attribute of the same name.
    Given a string it is the path of a JSON file whose top-level keys become the
    attributes; the other arguments listed below are then ignored.

    Args:
        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        n_embd: Dimensionality of the embeddings and hidden states.
        n_layer: Number of hidden layers in the Transformer encoder.
        n_head: Number of attention heads for each attention layer in
            the Transformer encoder.
        afn: The non-linear activation function (function or string) in the
            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
        resid_pdrop: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        embd_pdrop: The dropout ratio for the embeddings.
        attn_pdrop: The dropout ratio for the attention
            probabilities.
        layer_norm_epsilon: epsilon to use in the layer norm layers
        initializer_range: The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        predict_special_tokens: should we predict special tokens (when the model has a LM head)
        num_labels, summary_type, summary_use_proj, summary_activation,
        summary_proj_to_labels, summary_first_dropout: Stored unchanged. Presumably
            read by a sequence-summary / classification head in the model code, which
            is not part of this module -- confirm there.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If the first argument is neither an ``int`` nor a string.
    """
    pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(
        self,
        vocab_size_or_config_json_file=40478,
        n_positions=512,
        n_ctx=512,
        n_embd=768,
        n_layer=12,
        n_head=12,
        afn="gelu",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        predict_special_tokens=True,

        num_labels=1,
        summary_type='cls_index',
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        **kwargs
    ):
        """Constructs OpenAIGPTConfig; the arguments are described in the class docstring."""
        super(OpenAIGPTConfig, self).__init__(**kwargs)

        # `unicode` only exists on Python 2; the version test short-circuits before the
        # name is ever looked up on Python 3.
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):
            # Path to a JSON file: its keys are written straight into the instance dict.
            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.n_ctx = n_ctx
            self.n_positions = n_positions
            self.n_embd = n_embd
            self.n_layer = n_layer
            self.n_head = n_head
            self.afn = afn
            self.resid_pdrop = resid_pdrop
            self.embd_pdrop = embd_pdrop
            self.attn_pdrop = attn_pdrop
            self.layer_norm_epsilon = layer_norm_epsilon
            self.initializer_range = initializer_range
            self.predict_special_tokens = predict_special_tokens

            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_first_dropout = summary_first_dropout
            self.summary_proj_to_labels = summary_proj_to_labels
        else:
            # The leading space on the second literal matters: adjacent string literals
            # are joined with nothing in between.
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                " or the path to a pretrained model config file (str)"
            )

    @property
    def max_position_embeddings(self):
        """Read-only alias for ``n_positions``."""
        return self.n_positions

    @property
    def hidden_size(self):
        """Read-only alias for ``n_embd``."""
        return self.n_embd

    @property
    def num_attention_heads(self):
        """Read-only alias for ``n_head``."""
        return self.n_head

    @property
    def num_hidden_layers(self):
        """Read-only alias for ``n_layer``."""
        return self.n_layer
|
||||||
35
pytorch_transformers/configuration_roberta.py
Normal file
35
pytorch_transformers/configuration_roberta.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" RoBERTa configuration """
|
||||||
|
|
||||||
|
from __future__ import (absolute_import, division, print_function,
|
||||||
|
unicode_literals)
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from .configuration_bert import BertConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each RoBERTa shortcut name to the URL of its hosted JSON configuration.
# Every file is published as "<bucket>/<shortcut name>-config.json".
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    shortcut: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json".format(shortcut)
    for shortcut in (
        'roberta-base',
        'roberta-large',
        'roberta-large-mnli',
    )
}
|
||||||
|
|
||||||
|
|
||||||
|
class RobertaConfig(BertConfig):
    """RoBERTa configuration.

    Inherits the constructor and every attribute from `BertConfig` unchanged; the only
    override is the archive map, which lists the RoBERTa configuration files.
    """
    pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
167
pytorch_transformers/configuration_transfo_xl.py
Normal file
167
pytorch_transformers/configuration_transfo_xl.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Transformer XL configuration """
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maps each Transformer-XL shortcut name to the URL of its hosted JSON configuration.
# Every file is published as "<bucket>/<shortcut name>-config.json".
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    shortcut: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json".format(shortcut)
    for shortcut in (
        'transfo-xl-wt103',
    )
}
|
||||||
|
|
||||||
|
class TransfoXLConfig(PretrainedConfig):
|
||||||
|
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
||||||
|
cutoffs: cutoffs for the adaptive softmax
|
||||||
|
d_model: Dimensionality of the model's hidden states.
|
||||||
|
d_embed: Dimensionality of the embeddings
|
||||||
|
d_head: Dimensionality of the model's heads.
|
||||||
|
div_val: divident value for adapative input and softmax
|
||||||
|
pre_lnorm: apply LayerNorm to the input instead of the output
|
||||||
|
d_inner: Inner dimension in FF
|
||||||
|
n_layer: Number of hidden layers in the Transformer encoder.
|
||||||
|
n_head: Number of attention heads for each attention layer in
|
||||||
|
the Transformer encoder.
|
||||||
|
tgt_len: number of tokens to predict
|
||||||
|
ext_len: length of the extended context
|
||||||
|
mem_len: length of the retained previous heads
|
||||||
|
same_length: use the same attn length for all tokens
|
||||||
|
proj_share_all_but_first: True to share all but first projs, False not to share.
|
||||||
|
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
|
||||||
|
clamp_len: use the same pos embeddings after clamp_len
|
||||||
|
sample_softmax: number of samples in sampled softmax
|
||||||
|
adaptive: use adaptive softmax
|
||||||
|
tie_weight: tie the word embedding and softmax weights
|
||||||
|
dropout: The dropout probabilitiy for all fully connected
|
||||||
|
layers in the embeddings, encoder, and pooler.
|
||||||
|
dropatt: The dropout ratio for the attention probabilities.
|
||||||
|
untie_r: untie relative position biases
|
||||||
|
embd_pdrop: The dropout ratio for the embeddings.
|
||||||
|
init: parameter initializer to use
|
||||||
|
init_range: parameters initialized by U(-init_range, init_range).
|
||||||
|
proj_init_std: parameters initialized by N(0, init_std)
|
||||||
|
init_std: parameters initialized by N(0, init_std)
|
||||||
|
"""
|
||||||
|
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
vocab_size_or_config_json_file=267735,
|
||||||
|
cutoffs=[20000, 40000, 200000],
|
||||||
|
d_model=1024,
|
||||||
|
d_embed=1024,
|
||||||
|
n_head=16,
|
||||||
|
d_head=64,
|
||||||
|
d_inner=4096,
|
||||||
|
div_val=4,
|
||||||
|
pre_lnorm=False,
|
||||||
|
n_layer=18,
|
||||||
|
tgt_len=128,
|
||||||
|
ext_len=0,
|
||||||
|
mem_len=1600,
|
||||||
|
clamp_len=1000,
|
||||||
|
same_length=True,
|
||||||
|
proj_share_all_but_first=True,
|
||||||
|
attn_type=0,
|
||||||
|
sample_softmax=-1,
|
||||||
|
adaptive=True,
|
||||||
|
tie_weight=True,
|
||||||
|
dropout=0.1,
|
||||||
|
dropatt=0.0,
|
||||||
|
untie_r=True,
|
||||||
|
init="normal",
|
||||||
|
init_range=0.01,
|
||||||
|
proj_init_std=0.01,
|
||||||
|
init_std=0.02,
|
||||||
|
**kwargs):
|
||||||
|
"""Constructs TransfoXLConfig.
|
||||||
|
"""
|
||||||
|
super(TransfoXLConfig, self).__init__(**kwargs)
|
||||||
|
|
||||||
|
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||||
|
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||||
|
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||||
|
json_config = json.loads(reader.read())
|
||||||
|
for key, value in json_config.items():
|
||||||
|
self.__dict__[key] = value
|
||||||
|
elif isinstance(vocab_size_or_config_json_file, int):
|
||||||
|
self.n_token = vocab_size_or_config_json_file
|
||||||
|
self.cutoffs = []
|
||||||
|
self.cutoffs.extend(cutoffs)
|
||||||
|
self.tie_weight = tie_weight
|
||||||
|
if proj_share_all_but_first:
|
||||||
|
self.tie_projs = [False] + [True] * len(self.cutoffs)
|
||||||
|
else:
|
||||||
|
self.tie_projs = [False] + [False] * len(self.cutoffs)
|
||||||
|
self.d_model = d_model
|
||||||
|
self.d_embed = d_embed
|
||||||
|
self.d_head = d_head
|
||||||
|
self.d_inner = d_inner
|
||||||
|
self.div_val = div_val
|
||||||
|
self.pre_lnorm = pre_lnorm
|
||||||
|
self.n_layer = n_layer
|
||||||
|
self.n_head = n_head
|
||||||
|
self.tgt_len = tgt_len
|
||||||
|
self.ext_len = ext_len
|
||||||
|
self.mem_len = mem_len
|
||||||
|
self.same_length = same_length
|
||||||
|
self.attn_type = attn_type
|
||||||
|
self.clamp_len = clamp_len
|
||||||
|
self.sample_softmax = sample_softmax
|
||||||
|
self.adaptive = adaptive
|
||||||
|
self.dropout = dropout
|
||||||
|
self.dropatt = dropatt
|
||||||
|
self.untie_r = untie_r
|
||||||
|
self.init = init
|
||||||
|
self.init_range = init_range
|
||||||
|
self.proj_init_std = proj_init_std
|
||||||
|
self.init_std = init_std
|
||||||
|
else:
|
||||||
|
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||||
|
" or the path to a pretrained model config file (str)")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def max_position_embeddings(self):
|
||||||
|
return self.tgt_len + self.ext_len + self.mem_len
|
||||||
|
|
||||||
|
@property
def vocab_size(self):
    """Alias for ``n_token`` (read side)."""
    n_token = self.n_token
    return n_token

@vocab_size.setter
def vocab_size(self, value):
    """Alias for ``n_token`` (write side): assigning here updates ``n_token``."""
    setattr(self, 'n_token', value)
|
||||||
|
|
||||||
|
@property
def hidden_size(self):
    """Read-only alias for ``d_model``."""
    d_model = self.d_model
    return d_model
|
||||||
|
|
||||||
|
@property
def num_attention_heads(self):
    """Read-only alias for ``n_head``."""
    n_head = self.n_head
    return n_head
|
||||||
|
|
||||||
|
@property
def num_hidden_layers(self):
    """Read-only alias for ``n_layer``."""
    n_layer = self.n_layer
    return n_layer
|
||||||
205
pytorch_transformers/configuration_utils.py
Normal file
205
pytorch_transformers/configuration_utils.py
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" Configuration base class and utilities."""
|
||||||
|
|
||||||
|
from __future__ import (absolute_import, division, print_function,
|
||||||
|
unicode_literals)
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .file_utils import cached_path, CONFIG_NAME
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class PretrainedConfig(object):
    r""" Base class for all configuration classes.
        Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.

        Note:
            A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
            It only affects the model's configuration.

        Class attributes (overridden by derived classes):
            - ``pretrained_config_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.

        Parameters:
            ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
            ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
            ``output_attentions``: boolean, default `False`. Should the model return attention weights.
            ``output_hidden_states``: boolean, default `False`. Should the model return all hidden-states.
            ``torchscript``: boolean, default `False`. Is the model used with Torchscript.
            ``pruned_heads``: dict, default `{}`. Stored as given; :func:`from_pretrained` normalizes it to ``{int: set}``
                (presumably layer index -> pruned head indices; confirm against the model code).
    """
    pretrained_config_archive_map = {}

    def __init__(self, **kwargs):
        # Only the keys below are consumed; any other keyword is left in ``kwargs`` and ignored.
        self.finetuning_task = kwargs.pop('finetuning_task', None)
        self.num_labels = kwargs.pop('num_labels', 2)
        self.output_attentions = kwargs.pop('output_attentions', False)
        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
        self.torchscript = kwargs.pop('torchscript', False)
        self.pruned_heads = kwargs.pop('pruned_heads', {})

    def save_pretrained(self, save_directory):
        """ Save a configuration object to the directory `save_directory`, so that it
            can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.

            `save_directory` must already exist; an ``AssertionError`` is raised otherwise.
        """
        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"

        # If we save using the predefined names, we can load using `from_pretrained`
        output_config_file = os.path.join(save_directory, CONFIG_NAME)

        self.to_json_file(output_config_file)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.

        Parameters:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.

                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            return_unused_kwargs: (`optional`) bool:

                - If False, then this function returns just the final configuration object.
                - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.

        Raises:
            EnvironmentError: re-raised (after logging) when the configuration file cannot be resolved.

        Examples::

            # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
            # derived class: BertConfig
            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
            config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
            config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
            config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
            assert config.output_attentions == True
            config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
                                                               foo=False, return_unused_kwargs=True)
            assert config.output_attentions == True
            assert unused_kwargs == {'foo': False}

        """
        cache_dir = kwargs.pop('cache_dir', None)
        force_download = kwargs.pop('force_download', False)
        proxies = kwargs.pop('proxies', None)
        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)

        if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
            config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
        elif os.path.isdir(pretrained_model_name_or_path):
            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
        else:
            config_file = pretrained_model_name_or_path
        # redirect to the cache, if necessary
        try:
            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
        except EnvironmentError:
            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                logger.error(
                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
                        config_file))
            else:
                logger.error(
                    "Model name '{}' was not found in model name list ({}). "
                    "We assumed '{}' was a path or url but couldn't find any file "
                    "associated to this path or url.".format(
                        pretrained_model_name_or_path,
                        ', '.join(cls.pretrained_config_archive_map.keys()),
                        config_file))
            # Bare raise keeps the original traceback intact (also on Python 2).
            raise
        if resolved_config_file == config_file:
            logger.info("loading configuration file {}".format(config_file))
        else:
            logger.info("loading configuration file {} from cache at {}".format(
                config_file, resolved_config_file))

        # Load config
        config = cls.from_json_file(resolved_config_file)

        # JSON object keys are always strings and JSON has no set type, so restore {int: set}.
        if hasattr(config, 'pruned_heads'):
            config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items())

        # Update config with kwargs if needed; consumed keys are removed from kwargs.
        for key in list(kwargs):
            if hasattr(config, key):
                setattr(config, key, kwargs.pop(key))

        logger.info("Model config %s", config)
        if return_unused_kwargs:
            return config, kwargs
        else:
            return config

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `Config` from a Python dictionary of parameters."""
        # -1 is a placeholder vocabulary size; every key of `json_object` then overwrites the defaults.
        config = cls(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `Config` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __eq__(self, other):
        # Let Python fall back to its default comparison for unrelated types
        # (e.g. ``config == None``) instead of failing on a missing ``__dict__``.
        if not isinstance(other, PretrainedConfig):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; define it explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    @staticmethod
    def _json_default(obj):
        """JSON fallback encoder: emit sets (e.g. ``pruned_heads`` values) as lists."""
        if isinstance(obj, (set, frozenset)):
            try:
                return sorted(obj)
            except TypeError:
                # Elements without a total order: keep them, unordered.
                return list(obj)
        raise TypeError("Object of type {} is not JSON serializable".format(type(obj).__name__))

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True, default=self._json_default) + "\n"

    def to_json_file(self, json_file_path):
        """ Save this instance to a json file."""
        with open(json_file_path, "w", encoding='utf-8') as writer:
            writer.write(self.to_json_string())
|
||||||
184
pytorch_transformers/configuration_xlm.py
Normal file
184
pytorch_transformers/configuration_xlm.py
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" XLM configuration """
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Shortcut name -> URL of the hosted JSON configuration. Every URL follows the
# same "<bucket>/<shortcut-name>-config.json" layout, so the map is derived
# from the list of shortcut names.
_XLM_CONFIG_URL_TEMPLATE = "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json"
_XLM_SHORTCUT_NAMES = (
    'xlm-mlm-en-2048',
    'xlm-mlm-ende-1024',
    'xlm-mlm-enfr-1024',
    'xlm-mlm-enro-1024',
    'xlm-mlm-tlm-xnli15-1024',
    'xlm-mlm-xnli15-1024',
    'xlm-clm-enfr-1024',
    'xlm-clm-ende-1024',
    'xlm-mlm-17-1280',
    'xlm-mlm-100-1280',
)
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    shortcut: _XLM_CONFIG_URL_TEMPLATE.format(shortcut) for shortcut in _XLM_SHORTCUT_NAMES
}
|
||||||
|
|
||||||
|
|
||||||
|
class XLMConfig(PretrainedConfig):
    """Configuration class to store the configuration of a `XLMModel`.

    Args:
        vocab_size_or_config_json_file: Either the vocabulary size (int), stored as
            ``n_words`` and exposed as ``vocab_size``, or the path (str) to a JSON
            configuration file whose key/value pairs are copied onto the instance.
            When a path is given, the remaining named arguments below are not applied.
        emb_dim: Stored as-is; exposed as ``hidden_size``.
        n_layers: Stored as-is; exposed as ``num_hidden_layers``.
        n_heads: Stored as-is; exposed as ``num_attention_heads``.
        dropout: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        attention_dropout: The dropout ratio for the attention
            probabilities.
        gelu_activation: Presumably selects GELU as the feed-forward activation
            (TODO confirm against the model code).
        sinusoidal_embeddings: Presumably switches to fixed sinusoidal position
            embeddings (TODO confirm against the model code).
        causal, asm, n_langs, use_lang_emb, is_encoder: Stored as-is; their effect
            is defined by the model code, which is not part of this module.
        max_position_embeddings: The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        embed_init_std: Stored as-is; presumably the stddev used to initialize
            embedding weights (TODO confirm).
        layer_norm_eps: The epsilon used by LayerNorm.
        init_std: Stored as-is; presumably the stddev used to initialize the other
            weight matrices (TODO confirm).
        bos_index, eos_index, pad_index, unk_index, mask_index: Stored as-is;
            presumably the vocabulary indices of the corresponding special tokens.
        finetuning_task: Name of the task used to fine-tune the model, if any.
        num_labels: Number of classes to use when the model is a classification model.
        summary_type, summary_use_proj, summary_activation, summary_proj_to_labels,
        summary_first_dropout, start_n_top, end_n_top: Stored as-is; consumed by
            model heads defined outside this module.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: if the first argument is neither an int nor a string.
    """
    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30145,
                 emb_dim=2048,
                 n_layers=12,
                 n_heads=16,
                 dropout=0.1,
                 attention_dropout=0.1,
                 gelu_activation=True,
                 sinusoidal_embeddings=False,
                 causal=False,
                 asm=False,
                 n_langs=1,
                 use_lang_emb=True,
                 max_position_embeddings=512,
                 embed_init_std=2048 ** -0.5,
                 layer_norm_eps=1e-12,
                 init_std=0.02,
                 bos_index=0,
                 eos_index=1,
                 pad_index=2,
                 unk_index=3,
                 mask_index=5,
                 is_encoder=True,

                 finetuning_task=None,
                 num_labels=2,
                 summary_type='first',
                 summary_use_proj=True,
                 summary_activation=None,
                 summary_proj_to_labels=True,
                 summary_first_dropout=0.1,
                 start_n_top=5,
                 end_n_top=5,
                 **kwargs):
        """Constructs XLMConfig.
        """
        super(XLMConfig, self).__init__(**kwargs)

        # `unicode` only exists on Python 2; the version test short-circuits on Python 3.
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):  # noqa: F821
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.load(reader)
            # Copy every JSON entry verbatim onto the instance.
            self.__dict__.update(json_config)
        elif isinstance(vocab_size_or_config_json_file, int):
            self.n_words = vocab_size_or_config_json_file
            self.emb_dim = emb_dim
            self.n_layers = n_layers
            self.n_heads = n_heads
            self.dropout = dropout
            self.attention_dropout = attention_dropout
            self.gelu_activation = gelu_activation
            self.sinusoidal_embeddings = sinusoidal_embeddings
            self.causal = causal
            self.asm = asm
            self.n_langs = n_langs
            self.use_lang_emb = use_lang_emb
            self.layer_norm_eps = layer_norm_eps
            self.bos_index = bos_index
            self.eos_index = eos_index
            self.pad_index = pad_index
            self.unk_index = unk_index
            self.mask_index = mask_index
            self.is_encoder = is_encoder
            self.max_position_embeddings = max_position_embeddings
            self.embed_init_std = embed_init_std
            self.init_std = init_std
            # These two override the values set by PretrainedConfig.__init__ above.
            self.finetuning_task = finetuning_task
            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_proj_to_labels = summary_proj_to_labels
            self.summary_first_dropout = summary_first_dropout
            self.start_n_top = start_n_top
            self.end_n_top = end_n_top
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")

    @property
    def vocab_size(self):
        """Alias for ``n_words``."""
        return self.n_words

    @vocab_size.setter
    def vocab_size(self, value):
        self.n_words = value

    @property
    def hidden_size(self):
        """Read-only alias for ``emb_dim``."""
        return self.emb_dim

    @property
    def num_attention_heads(self):
        """Read-only alias for ``n_heads``."""
        return self.n_heads

    @property
    def num_hidden_layers(self):
        """Read-only alias for ``n_layers``."""
        return self.n_layers
|
||||||
172
pytorch_transformers/configuration_xlnet.py
Normal file
172
pytorch_transformers/configuration_xlnet.py
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" XLNet configuration """
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import open
|
||||||
|
|
||||||
|
from .configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Shortcut name -> URL of the hosted JSON configuration; both URLs share the
# "<bucket>/<shortcut-name>-config.json" layout.
_XLNET_CONFIG_URL_TEMPLATE = "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json"
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    shortcut: _XLNET_CONFIG_URL_TEMPLATE.format(shortcut)
    for shortcut in ('xlnet-base-cased', 'xlnet-large-cased')
}
|
||||||
|
|
||||||
|
|
||||||
|
class XLNetConfig(PretrainedConfig):
    """Configuration class to store the configuration of a ``XLNetModel``.

    Args:
        vocab_size_or_config_json_file: Either the vocabulary size (int) of ``inputs_ids``
            in ``XLNetModel``, stored as ``n_token`` and exposed as ``vocab_size``, or the
            path (str) to a JSON configuration file whose key/value pairs are copied onto
            the instance. When a path is given, the remaining named arguments below are
            not applied.
        d_model: Size of the encoder layers and the pooler layer. Must be divisible by ``n_head``.
        n_layer: Number of hidden layers in the Transformer encoder.
        n_head: Number of attention heads for each attention layer in
            the Transformer encoder. ``d_head`` is derived as ``d_model // n_head``.
        d_inner: The size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
        ff_activation: The non-linear activation function (function or string) in the
            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
        untie_r: untie relative position biases
        attn_type: 'bi' for XLNet, 'uni' for Transformer-XL

        initializer_range: The stddev of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps: The epsilon used by LayerNorm.

        dropout: float, dropout rate.
        mem_len: int, the number of tokens to cache.
        reuse_len: int, the number of tokens in the current batch to be cached
            and reused in the future.
        bi_data: bool, whether to use bidirectional input pipeline.
            Usually set to True during pretraining and False during finetuning.
        clamp_len: int, clamp all relative distances larger than clamp_len.
            -1 means no clamping.
        same_length: bool, whether to use the same attention length for each token.

        finetuning_task: name of the glue task on which the model was fine-tuned if any
        num_labels: Number of classes to use when the model is a classification model.
        summary_type, summary_use_proj, summary_activation, summary_last_dropout,
        start_n_top, end_n_top: Stored as-is; consumed by model heads defined outside
            this module.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        AssertionError: if ``d_model`` is not divisible by ``n_head``.
        ValueError: if the first argument is neither an int nor a string.
    """
    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=32000,
                 d_model=1024,
                 n_layer=24,
                 n_head=16,
                 d_inner=4096,
                 ff_activation="gelu",
                 untie_r=True,
                 attn_type="bi",

                 initializer_range=0.02,
                 layer_norm_eps=1e-12,

                 dropout=0.1,
                 mem_len=None,
                 reuse_len=None,
                 bi_data=False,
                 clamp_len=-1,
                 same_length=False,

                 finetuning_task=None,
                 num_labels=2,
                 summary_type='last',
                 summary_use_proj=True,
                 summary_activation='tanh',
                 summary_last_dropout=0.1,
                 start_n_top=5,
                 end_n_top=5,
                 **kwargs):
        """Constructs XLNetConfig.
        """
        super(XLNetConfig, self).__init__(**kwargs)

        # `unicode` only exists on Python 2; the version test short-circuits on Python 3.
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):  # noqa: F821
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.load(reader)
            # Copy every JSON entry verbatim onto the instance.
            self.__dict__.update(json_config)
        elif isinstance(vocab_size_or_config_json_file, int):
            self.n_token = vocab_size_or_config_json_file
            self.d_model = d_model
            self.n_layer = n_layer
            self.n_head = n_head
            assert d_model % n_head == 0, \
                "d_model ({}) must be divisible by n_head ({})".format(d_model, n_head)
            # Per-head dimension is derived, not configurable.
            self.d_head = d_model // n_head
            self.ff_activation = ff_activation
            self.d_inner = d_inner
            self.untie_r = untie_r
            self.attn_type = attn_type

            self.initializer_range = initializer_range
            self.layer_norm_eps = layer_norm_eps

            self.dropout = dropout
            self.mem_len = mem_len
            self.reuse_len = reuse_len
            self.bi_data = bi_data
            self.clamp_len = clamp_len
            self.same_length = same_length

            # These two override the values set by PretrainedConfig.__init__ above.
            self.finetuning_task = finetuning_task
            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_last_dropout = summary_last_dropout
            self.start_n_top = start_n_top
            self.end_n_top = end_n_top
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")

    @property
    def max_position_embeddings(self):
        """Always -1 for this configuration."""
        return -1

    @property
    def vocab_size(self):
        """Alias for ``n_token``."""
        return self.n_token

    @vocab_size.setter
    def vocab_size(self, value):
        self.n_token = value

    @property
    def hidden_size(self):
        """Read-only alias for ``d_model``."""
        return self.d_model

    @property
    def num_attention_heads(self):
        """Read-only alias for ``n_head``."""
        return self.n_head

    @property
    def num_hidden_layers(self):
        """Read-only alias for ``n_layer``."""
        return self.n_layer
|
||||||
@@ -21,7 +21,7 @@ from io import open
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
|
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
GPT2Config,
|
GPT2Config,
|
||||||
GPT2Model,
|
GPT2Model,
|
||||||
load_tf_weights_in_gpt2)
|
load_tf_weights_in_gpt2)
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ from io import open
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
|
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
OpenAIGPTConfig,
|
OpenAIGPTConfig,
|
||||||
OpenAIGPTModel,
|
OpenAIGPTModel,
|
||||||
load_tf_weights_in_openai_gpt)
|
load_tf_weights_in_openai_gpt)
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ import argparse
|
|||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from pytorch_transformers.modeling import BertModel
|
from pytorch_transformers import BertModel
|
||||||
|
|
||||||
|
|
||||||
def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
|
def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
|
||||||
|
|||||||
@@ -23,12 +23,12 @@ import torch
|
|||||||
|
|
||||||
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
||||||
from fairseq.modules import TransformerSentenceEncoderLayer
|
from fairseq.modules import TransformerSentenceEncoderLayer
|
||||||
from pytorch_transformers.modeling_bert import (BertConfig, BertEncoder,
|
from pytorch_transformers import (BertConfig, BertEncoder,
|
||||||
BertIntermediate, BertLayer,
|
BertIntermediate, BertLayer,
|
||||||
BertModel, BertOutput,
|
BertModel, BertOutput,
|
||||||
BertSelfAttention,
|
BertSelfAttention,
|
||||||
BertSelfOutput)
|
BertSelfOutput)
|
||||||
from pytorch_transformers.modeling_roberta import (RobertaEmbeddings,
|
from pytorch_transformers import (RobertaEmbeddings,
|
||||||
RobertaForMaskedLM,
|
RobertaForMaskedLM,
|
||||||
RobertaForSequenceClassification,
|
RobertaForSequenceClassification,
|
||||||
RobertaModel)
|
RobertaModel)
|
||||||
@@ -53,6 +53,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
intermediate_size=roberta.args.encoder_ffn_embed_dim,
|
intermediate_size=roberta.args.encoder_ffn_embed_dim,
|
||||||
max_position_embeddings=514,
|
max_position_embeddings=514,
|
||||||
type_vocab_size=1,
|
type_vocab_size=1,
|
||||||
|
layer_norm_eps=1e-5, # PyTorch default used in fairseq
|
||||||
)
|
)
|
||||||
if classification_head:
|
if classification_head:
|
||||||
config.num_labels = roberta.args.num_classes
|
config.num_labels = roberta.args.num_classes
|
||||||
@@ -69,7 +70,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them.
|
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them.
|
||||||
model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
|
model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
|
||||||
model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
|
model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
|
||||||
model.roberta.embeddings.LayerNorm.variance_epsilon = roberta_sent_encoder.emb_layer_norm.eps
|
|
||||||
|
|
||||||
for i in range(config.num_hidden_layers):
|
for i in range(config.num_hidden_layers):
|
||||||
# Encoder: start of layer
|
# Encoder: start of layer
|
||||||
@@ -98,7 +98,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
|
self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
|
||||||
self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
|
self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
|
||||||
self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
|
self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
|
||||||
self_output.LayerNorm.variance_epsilon = roberta_layer.self_attn_layer_norm.eps
|
|
||||||
|
|
||||||
### intermediate
|
### intermediate
|
||||||
intermediate: BertIntermediate = layer.intermediate
|
intermediate: BertIntermediate = layer.intermediate
|
||||||
@@ -117,7 +116,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
bert_output.dense.bias = roberta_layer.fc2.bias
|
bert_output.dense.bias = roberta_layer.fc2.bias
|
||||||
bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
|
bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
|
||||||
bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
|
bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
|
||||||
bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps
|
|
||||||
#### end of layer
|
#### end of layer
|
||||||
|
|
||||||
if classification_head:
|
if classification_head:
|
||||||
@@ -131,7 +129,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
|
model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
|
||||||
model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
|
model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
|
||||||
model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
|
model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
|
||||||
model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps
|
|
||||||
model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
|
model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
|
||||||
model.lm_head.bias = roberta.model.decoder.lm_head.bias
|
model.lm_head.bias = roberta.model.decoder.lm_head.bias
|
||||||
|
|
||||||
@@ -144,6 +141,8 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
|
|||||||
else:
|
else:
|
||||||
their_output = roberta.model(input_ids)[0]
|
their_output = roberta.model(input_ids)[0]
|
||||||
print(our_output.shape, their_output.shape)
|
print(our_output.shape, their_output.shape)
|
||||||
|
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
|
||||||
|
print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
|
||||||
success = torch.allclose(our_output, their_output, atol=1e-3)
|
success = torch.allclose(our_output, their_output, atol=1e-3)
|
||||||
print(
|
print(
|
||||||
"Do both models output the same tensors?",
|
"Do both models output the same tensors?",
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ from __future__ import print_function
|
|||||||
import argparse
|
import argparse
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
|
from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ import torch
|
|||||||
import pytorch_transformers.tokenization_transfo_xl as data_utils
|
import pytorch_transformers.tokenization_transfo_xl as data_utils
|
||||||
|
|
||||||
from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
|
from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
|
||||||
from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel,
|
from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
|
||||||
load_tf_weights_in_transfo_xl)
|
load_tf_weights_in_transfo_xl)
|
||||||
from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
|
from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ from io import open
|
|||||||
import torch
|
import torch
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME
|
from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
|
||||||
from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
|
from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ import os
|
|||||||
import argparse
|
import argparse
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
|
from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
|
||||||
XLNetConfig,
|
XLNetConfig,
|
||||||
XLNetLMHeadModel, XLNetForQuestionAnswering,
|
XLNetLMHeadModel, XLNetForQuestionAnswering,
|
||||||
XLNetForSequenceClassification,
|
XLNetForSequenceClassification,
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import sys
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import six
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
import fnmatch
|
import fnmatch
|
||||||
@@ -47,8 +48,35 @@ except (AttributeError, ImportError):
|
|||||||
|
|
||||||
PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
||||||
|
|
||||||
|
WEIGHTS_NAME = "pytorch_model.bin"
|
||||||
|
TF_WEIGHTS_NAME = 'model.ckpt'
|
||||||
|
CONFIG_NAME = "config.json"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
if not six.PY2:
|
||||||
|
def add_start_docstrings(*docstr):
|
||||||
|
def docstring_decorator(fn):
|
||||||
|
fn.__doc__ = ''.join(docstr) + fn.__doc__
|
||||||
|
return fn
|
||||||
|
return docstring_decorator
|
||||||
|
|
||||||
|
def add_end_docstrings(*docstr):
|
||||||
|
def docstring_decorator(fn):
|
||||||
|
fn.__doc__ = fn.__doc__ + ''.join(docstr)
|
||||||
|
return fn
|
||||||
|
return docstring_decorator
|
||||||
|
else:
|
||||||
|
# Not possible to update class docstrings on python2
|
||||||
|
def add_start_docstrings(*docstr):
|
||||||
|
def docstring_decorator(fn):
|
||||||
|
return fn
|
||||||
|
return docstring_decorator
|
||||||
|
|
||||||
|
def add_end_docstrings(*docstr):
|
||||||
|
def docstring_decorator(fn):
|
||||||
|
return fn
|
||||||
|
return docstring_decorator
|
||||||
|
|
||||||
def url_to_filename(url, etag=None):
|
def url_to_filename(url, etag=None):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -18,120 +18,21 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import torch
|
from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
|
||||||
import torch.nn as nn
|
from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
|
||||||
from torch.nn import CrossEntropyLoss, MSELoss
|
from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
|
||||||
from torch.nn.parameter import Parameter
|
from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
|
||||||
|
from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
|
||||||
from .modeling_bert import BertConfig, BertModel
|
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
|
||||||
from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
|
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
|
||||||
from .modeling_gpt2 import GPT2Config, GPT2Model
|
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
|
||||||
from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
|
|
||||||
from .modeling_xlnet import XLNetConfig, XLNetModel
|
|
||||||
from .modeling_xlm import XLMConfig, XLMModel
|
|
||||||
from .modeling_roberta import RobertaConfig, RobertaModel
|
|
||||||
from .modeling_distilbert import DistilBertConfig, DistilBertModel
|
|
||||||
|
|
||||||
from .modeling_utils import PreTrainedModel, SequenceSummary
|
from .modeling_utils import PreTrainedModel, SequenceSummary
|
||||||
|
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class AutoConfig(object):
|
|
||||||
r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
|
|
||||||
that will be instantiated as one of the configuration classes of the library
|
|
||||||
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
|
|
||||||
class method.
|
|
||||||
|
|
||||||
The `from_pretrained()` method take care of returning the correct model class instance
|
|
||||||
using pattern matching on the `pretrained_model_name_or_path` string.
|
|
||||||
|
|
||||||
The base model class to instantiate is selected as the first pattern matching
|
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
|
||||||
- contains `bert`: BertConfig (Bert model)
|
|
||||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
|
||||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
|
||||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
|
||||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
|
||||||
- contains `xlm`: XLMConfig (XLM model)
|
|
||||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
|
||||||
|
|
||||||
This class cannot be instantiated using `__init__()` (throw an error).
|
|
||||||
"""
|
|
||||||
def __init__(self):
|
|
||||||
raise EnvironmentError("AutoConfig is designed to be instantiated "
|
|
||||||
"using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
|
||||||
r""" Instantiate a one of the configuration classes of the library
|
|
||||||
from a pre-trained model configuration.
|
|
||||||
|
|
||||||
The configuration class to instantiate is selected as the first pattern matching
|
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
|
||||||
- contains `bert`: BertConfig (Bert model)
|
|
||||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
|
||||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
|
||||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
|
||||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
|
||||||
- contains `xlm`: XLMConfig (XLM model)
|
|
||||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
|
||||||
|
|
||||||
Params:
|
|
||||||
**pretrained_model_name_or_path**: either:
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
|
|
||||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
|
||||||
- a path to a `directory` containing a configuration file saved
|
|
||||||
using the `save_pretrained(save_directory)` method.
|
|
||||||
- a path or url to a saved configuration `file`.
|
|
||||||
**cache_dir**: (`optional`) string:
|
|
||||||
Path to a directory in which a downloaded pre-trained model
|
|
||||||
configuration should be cached if the standard cache should not be used.
|
|
||||||
**return_unused_kwargs**: (`optional`) bool:
|
|
||||||
- If False, then this function returns just the final configuration object.
|
|
||||||
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
|
|
||||||
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
|
|
||||||
ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
|
||||||
**kwargs**: (`optional`) dict:
|
|
||||||
Dictionary of key/value pairs with which to update the configuration object after loading.
|
|
||||||
- The values in kwargs of any keys which are configuration attributes will be used
|
|
||||||
to override the loaded values.
|
|
||||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
|
|
||||||
by the `return_unused_kwargs` keyword parameter.
|
|
||||||
|
|
||||||
Examples::
|
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
|
||||||
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
|
||||||
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
|
|
||||||
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
|
|
||||||
assert config.output_attention == True
|
|
||||||
config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
|
|
||||||
foo=False, return_unused_kwargs=True)
|
|
||||||
assert config.output_attention == True
|
|
||||||
assert unused_kwargs == {'foo': False}
|
|
||||||
|
|
||||||
"""
|
|
||||||
if 'distilbert' in pretrained_model_name_or_path:
|
|
||||||
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
elif 'roberta' in pretrained_model_name_or_path:
|
|
||||||
return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
|
||||||
return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
elif 'openai-gpt' in pretrained_model_name_or_path:
|
|
||||||
return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
elif 'gpt2' in pretrained_model_name_or_path:
|
|
||||||
return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
elif 'transfo-xl' in pretrained_model_name_or_path:
|
|
||||||
return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
elif 'xlnet' in pretrained_model_name_or_path:
|
|
||||||
return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
elif 'xlm' in pretrained_model_name_or_path:
|
|
||||||
return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
||||||
|
|
||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
|
||||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
|
||||||
"'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
|
||||||
|
|
||||||
|
|
||||||
class AutoModel(object):
|
class AutoModel(object):
|
||||||
r"""
|
r"""
|
||||||
@@ -140,20 +41,21 @@ class AutoModel(object):
|
|||||||
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||||
class method.
|
class method.
|
||||||
|
|
||||||
The `from_pretrained()` method take care of returning the correct model class instance
|
The `from_pretrained()` method takes care of returning the correct model class instance
|
||||||
using pattern matching on the `pretrained_model_name_or_path` string.
|
using pattern matching on the `pretrained_model_name_or_path` string.
|
||||||
|
|
||||||
The base model class to instantiate is selected as the first pattern matching
|
The base model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||||
|
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||||
- contains `bert`: BertModel (Bert model)
|
- contains `bert`: BertModel (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetModel (XLNet model)
|
- contains `xlnet`: XLNetModel (XLNet model)
|
||||||
- contains `xlm`: XLMModel (XLM model)
|
- contains `xlm`: XLMModel (XLM model)
|
||||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
|
||||||
|
|
||||||
This class cannot be instantiated using `__init__()` (throw an error).
|
This class cannot be instantiated using `__init__()` (throws an error).
|
||||||
"""
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
raise EnvironmentError("AutoModel is designed to be instantiated "
|
raise EnvironmentError("AutoModel is designed to be instantiated "
|
||||||
@@ -161,61 +63,64 @@ class AutoModel(object):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||||
r""" Instantiate a one of the base model classes of the library
|
r""" Instantiates one of the base model classes of the library
|
||||||
from a pre-trained model configuration.
|
from a pre-trained model configuration.
|
||||||
|
|
||||||
The base model class to instantiate is selected as the first pattern matching
|
The model class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||||
|
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||||
- contains `bert`: BertModel (Bert model)
|
- contains `bert`: BertModel (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetModel (XLNet model)
|
- contains `xlnet`: XLNetModel (XLNet model)
|
||||||
- contains `xlm`: XLMModel (XLM model)
|
- contains `xlm`: XLMModel (XLM model)
|
||||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
|
||||||
|
|
||||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||||
To train the model, you should first set it back in training mode with `model.train()`
|
To train the model, you should first set it back in training mode with `model.train()`
|
||||||
|
|
||||||
Params:
|
Params:
|
||||||
**pretrained_model_name_or_path**: either:
|
pretrained_model_name_or_path: either:
|
||||||
- a string with the `shortcut name` of a pre-trained model to load from cache
|
|
||||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing a configuration file saved
|
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||||
using the `save_pretrained(save_directory)` method.
|
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||||
- a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
|
|
||||||
In this case, ``from_tf`` should be set to True and a configuration object should be
|
model_args: (`optional`) Sequence of positional arguments:
|
||||||
provided as `config` argument. This loading option is slower than converting the TensorFlow
|
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||||
checkpoint in a PyTorch model using the provided conversion scripts and loading
|
|
||||||
the PyTorch model afterwards.
|
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
||||||
**model_args**: (`optional`) Sequence:
|
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
||||||
All remaning positional arguments will be passed to the underlying model's __init__ function
|
|
||||||
**config**: an optional configuration for the model to use instead of an automatically loaded configuation.
|
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||||
Configuration can be automatically loaded when:
|
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
|
||||||
- the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
|
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||||
- the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
|
|
||||||
**state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
|
state_dict: (`optional`) dict:
|
||||||
from saved weights file.
|
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||||
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
|
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||||
a simpler option.
|
|
||||||
**cache_dir**: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which a downloaded pre-trained model
|
||||||
configuration should be cached if the standard cache should not be used.
|
configuration should be cached if the standard cache should not be used.
|
||||||
**output_loading_info**: (`optional`) boolean:
|
|
||||||
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
|
|
||||||
**kwargs**: (`optional`) dict:
|
|
||||||
Dictionary of key, values to update the configuration object after loading.
|
|
||||||
Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
|
|
||||||
|
|
||||||
- If a configuration is provided with `config`, **kwargs will be directly passed
|
force_download: (`optional`) boolean, default False:
|
||||||
to the underlying model's __init__ method.
|
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
|
||||||
- If a configuration is not provided, **kwargs will be first passed to the pretrained
|
|
||||||
model configuration class loading function (`PretrainedConfig.from_pretrained`).
|
proxies: (`optional`) dict, default None:
|
||||||
Each key of **kwargs that corresponds to a configuration attribute
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
will be used to override said attribute with the supplied **kwargs value.
|
The proxies are used on each request.
|
||||||
Remaining keys that do not correspond to any configuration attribute will
|
|
||||||
be passed to the underlying model's __init__ function.
|
output_loading_info: (`optional`) boolean:
|
||||||
|
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
|
||||||
|
|
||||||
|
kwargs: (`optional`) Remaining dictionary of keyword arguments:
|
||||||
|
Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
|
||||||
|
|
||||||
|
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||||
|
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
@@ -248,3 +153,345 @@ class AutoModel(object):
|
|||||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||||
"'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
"'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class AutoModelWithLMHead(object):
    r"""
        :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
        that will be instantiated as one of the language modeling model classes of the library
        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
            - contains `bert`: BertForMaskedLM (Bert model)
            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
            - contains `xlm`: XLMWithLMHeadModel (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
            - contains `bert`: BertForMaskedLM (Bert model)
            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
            - contains `xlm`: XLMWithLMHeadModel (XLM model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if `pretrained_model_name_or_path` contains none of the supported patterns.

        Examples::

            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = AutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        # Order matters: `distilbert` and `roberta` both contain the substring
        # `bert`, so they have to be tested before the plain `bert` pattern.
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # The message lists every pattern tested above, in dispatch order.
        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
                         "'distilbert', 'roberta', 'bert', 'openai-gpt', 'gpt2', "
                         "'transfo-xl', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class AutoModelForSequenceClassification(object):
    r"""
        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
        that will be instantiated as one of the sequence classification model classes of the library
        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: BertForSequenceClassification (Bert model)
            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
            - contains `xlm`: XLMForSequenceClassification (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # The message names this class (it previously named AutoModelWithLMHead).
        raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated "
            "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the sequence classification model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: BertForSequenceClassification (Bert model)
            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
            - contains `xlm`: XLMForSequenceClassification (XLM model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if `pretrained_model_name_or_path` contains none of the supported patterns.

        Examples::

            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        # Order matters: `distilbert` and `roberta` both contain the substring
        # `bert`, so they have to be tested before the plain `bert` pattern.
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # The message lists every pattern tested above, in dispatch order.
        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
                         "'distilbert', 'roberta', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
|
||||||
|
|
||||||
|
|
||||||
|
class AutoModelForQuestionAnswering(object):
    r"""
        :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: BertForQuestionAnswering (Bert model)
            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: XLMForQuestionAnswering (XLM model)

        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        # The message names this class (it previously named AutoModelWithLMHead).
        raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
            "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: BertForQuestionAnswering (Bert model)
            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: XLMForQuestionAnswering (XLM model)

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`

        Params:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method

            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.

            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Raises:
            ValueError: if `pretrained_model_name_or_path` contains none of the supported patterns.

        Examples::

            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        # Order matters: `distilbert` contains the substring `bert`, so it has
        # to be tested before the plain `bert` pattern.
        if 'distilbert' in pretrained_model_name_or_path:
            return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # NOTE(review): there is no `roberta` branch here, and any name containing
        # `roberta` also contains `bert`, so such names resolve to
        # BertForQuestionAnswering -- confirm this is intended.
        elif 'bert' in pretrained_model_name_or_path:
            return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # The message lists every pattern tested above, in dispatch order.
        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
                         "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
|
||||||
|
|||||||
@@ -28,8 +28,9 @@ import torch
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import CrossEntropyLoss, MSELoss
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
|
||||||
from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel,
|
from .modeling_utils import PreTrainedModel, prune_linear_layer
|
||||||
prune_linear_layer, add_start_docstrings)
|
from .configuration_bert import BertConfig
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -49,23 +50,6 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
|
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
|
||||||
}
|
}
|
||||||
|
|
||||||
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
||||||
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
|
|
||||||
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
|
|
||||||
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
|
|
||||||
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
|
|
||||||
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
|
|
||||||
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
|
|
||||||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
|
|
||||||
'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
|
|
||||||
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
|
|
||||||
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
|
|
||||||
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
|
|
||||||
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
|
|
||||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
|
||||||
""" Load tf checkpoints in a pytorch model.
|
""" Load tf checkpoints in a pytorch model.
|
||||||
"""
|
"""
|
||||||
@@ -149,95 +133,11 @@ def swish(x):
|
|||||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
|
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
|
||||||
|
|
||||||
|
|
||||||
class BertConfig(PretrainedConfig):
    r"""
        :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
        `BertModel`.

        The first constructor argument is either an ``int`` (the vocabulary size, in which case
        the remaining keyword values are stored on the instance) or a ``str`` path to a JSON file
        (in which case every entry of that file is copied onto the instance and the other
        keyword values are not applied).


        Arguments:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The stddev of the truncated_normal_initializer for
                initializing all weight matrices.
            layer_norm_eps: The epsilon used by LayerNorm.

        Raises:
            ValueError: if the first argument is neither an ``int`` nor a ``str``
                (``unicode`` is also accepted on Python 2).
    """
    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 **kwargs):
        super(BertConfig, self).__init__(**kwargs)

        first_arg = vocab_size_or_config_json_file
        # `unicode` only exists on Python 2; the version test short-circuits
        # before that name is evaluated on Python 3.
        given_path = isinstance(first_arg, str) or (
            sys.version_info[0] == 2 and isinstance(first_arg, unicode))

        if given_path:
            # Path form: copy every JSON entry straight onto the instance.
            with open(first_arg, "r", encoding='utf-8') as reader:
                raw_text = reader.read()
            json_config = json.loads(raw_text)
            for key, value in json_config.items():
                self.__dict__[key] = value
            return

        if not isinstance(first_arg, int):
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")

        # Integer form: the first argument is the vocabulary size and the
        # remaining keyword values are stored as given.
        self.vocab_size = first_arg
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
||||||
except (ImportError, AttributeError) as e:
|
except (ImportError, AttributeError) as e:
|
||||||
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
||||||
class BertLayerNorm(nn.Module):
|
BertLayerNorm = torch.nn.LayerNorm
|
||||||
def __init__(self, hidden_size, eps=1e-12):
|
|
||||||
"""Construct a layernorm module in the TF style (epsilon inside the square root).
|
|
||||||
"""
|
|
||||||
super(BertLayerNorm, self).__init__()
|
|
||||||
self.weight = nn.Parameter(torch.ones(hidden_size))
|
|
||||||
self.bias = nn.Parameter(torch.zeros(hidden_size))
|
|
||||||
self.variance_epsilon = eps
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
u = x.mean(-1, keepdim=True)
|
|
||||||
s = (x - u).pow(2).mean(-1, keepdim=True)
|
|
||||||
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
|
|
||||||
return self.weight * x + self.bias
|
|
||||||
|
|
||||||
class BertEmbeddings(nn.Module):
|
class BertEmbeddings(nn.Module):
|
||||||
"""Construct the embeddings from word, position and token_type embeddings.
|
"""Construct the embeddings from word, position and token_type embeddings.
|
||||||
@@ -350,23 +250,30 @@ class BertAttention(nn.Module):
|
|||||||
super(BertAttention, self).__init__()
|
super(BertAttention, self).__init__()
|
||||||
self.self = BertSelfAttention(config)
|
self.self = BertSelfAttention(config)
|
||||||
self.output = BertSelfOutput(config)
|
self.output = BertSelfOutput(config)
|
||||||
|
self.pruned_heads = set()
|
||||||
|
|
||||||
def prune_heads(self, heads):
|
def prune_heads(self, heads):
|
||||||
if len(heads) == 0:
|
if len(heads) == 0:
|
||||||
return
|
return
|
||||||
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
|
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
|
||||||
|
heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
|
# Compute how many pruned heads are before the head and move the index accordingly
|
||||||
|
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||||
mask[head] = 0
|
mask[head] = 0
|
||||||
mask = mask.view(-1).contiguous().eq(1)
|
mask = mask.view(-1).contiguous().eq(1)
|
||||||
index = torch.arange(len(mask))[mask].long()
|
index = torch.arange(len(mask))[mask].long()
|
||||||
|
|
||||||
# Prune linear layers
|
# Prune linear layers
|
||||||
self.self.query = prune_linear_layer(self.self.query, index)
|
self.self.query = prune_linear_layer(self.self.query, index)
|
||||||
self.self.key = prune_linear_layer(self.self.key, index)
|
self.self.key = prune_linear_layer(self.self.key, index)
|
||||||
self.self.value = prune_linear_layer(self.self.value, index)
|
self.self.value = prune_linear_layer(self.self.value, index)
|
||||||
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
||||||
# Update hyper params
|
|
||||||
|
# Update hyper params and store pruned heads
|
||||||
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
|
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
|
||||||
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
||||||
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def forward(self, input_tensor, attention_mask, head_mask=None):
|
def forward(self, input_tensor, attention_mask, head_mask=None):
|
||||||
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
||||||
@@ -544,12 +451,8 @@ class BertPreTrainedModel(PreTrainedModel):
|
|||||||
load_tf_weights = load_tf_weights_in_bert
|
load_tf_weights = load_tf_weights_in_bert
|
||||||
base_model_prefix = "bert"
|
base_model_prefix = "bert"
|
||||||
|
|
||||||
def __init__(self, *inputs, **kwargs):
|
def _init_weights(self, module):
|
||||||
super(BertPreTrainedModel, self).__init__(*inputs, **kwargs)
|
""" Initialize the weights """
|
||||||
|
|
||||||
def init_weights(self, module):
|
|
||||||
""" Initialize the weights.
|
|
||||||
"""
|
|
||||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||||
@@ -606,18 +509,18 @@ BERT_INPUTS_DOCSTRING = r"""
|
|||||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Indices of positions of each input sequence token in the position embeddings.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Segment token indices to indicate first and second portions of the inputs.
|
Segment token indices to indicate first and second portions of the inputs.
|
||||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
corresponds to a `sentence B` token
|
corresponds to a `sentence B` token
|
||||||
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Indices of positions of each input sequence token in the position embeddings.
|
||||||
Mask values selected in ``[0, 1]``:
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -662,7 +565,7 @@ class BertModel(BertPreTrainedModel):
|
|||||||
self.encoder = BertEncoder(config)
|
self.encoder = BertEncoder(config)
|
||||||
self.pooler = BertPooler(config)
|
self.pooler = BertPooler(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
old_embeddings = self.embeddings.word_embeddings
|
old_embeddings = self.embeddings.word_embeddings
|
||||||
@@ -678,7 +581,7 @@ class BertModel(BertPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = torch.ones_like(input_ids)
|
attention_mask = torch.ones_like(input_ids)
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
@@ -771,7 +674,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.cls = BertPreTrainingHeads(config)
|
self.cls = BertPreTrainingHeads(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -781,10 +684,14 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.cls.predictions.decoder,
|
self._tie_or_clone_weights(self.cls.predictions.decoder,
|
||||||
self.bert.embeddings.word_embeddings)
|
self.bert.embeddings.word_embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
next_sentence_label=None, position_ids=None, head_mask=None):
|
masked_lm_labels=None, next_sentence_label=None):
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
sequence_output, pooled_output = outputs[:2]
|
sequence_output, pooled_output = outputs[:2]
|
||||||
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
|
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
|
||||||
@@ -839,7 +746,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.cls = BertOnlyMLMHead(config)
|
self.cls = BertOnlyMLMHead(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -849,10 +756,14 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.cls.predictions.decoder,
|
self._tie_or_clone_weights(self.cls.predictions.decoder,
|
||||||
self.bert.embeddings.word_embeddings)
|
self.bert.embeddings.word_embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, head_mask=None):
|
masked_lm_labels=None):
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
prediction_scores = self.cls(sequence_output)
|
prediction_scores = self.cls(sequence_output)
|
||||||
@@ -904,12 +815,17 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
|||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.cls = BertOnlyNSPHead(config)
|
self.cls = BertOnlyNSPHead(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
|
next_sentence_label=None):
|
||||||
|
|
||||||
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None,
|
|
||||||
position_ids=None, head_mask=None):
|
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
seq_relationship_score = self.cls(pooled_output)
|
seq_relationship_score = self.cls(pooled_output)
|
||||||
@@ -965,12 +881,17 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
|
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
||||||
|
position_ids=None, head_mask=None, labels=None):
|
||||||
|
|
||||||
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
|
||||||
position_ids=None, head_mask=None):
|
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
pooled_output = self.dropout(pooled_output)
|
pooled_output = self.dropout(pooled_output)
|
||||||
@@ -993,45 +914,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
|
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
|
||||||
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
|
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
|
||||||
BERT_START_DOCSTRING)
|
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
|
||||||
class BertForMultipleChoice(BertPreTrainedModel):
|
class BertForMultipleChoice(BertPreTrainedModel):
|
||||||
r"""
|
r"""
|
||||||
Inputs:
|
|
||||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Indices of input sequence tokens in the vocabulary.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
|
||||||
|
|
||||||
(a) For sequence pairs:
|
|
||||||
|
|
||||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
|
||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
|
||||||
|
|
||||||
(b) For single sequences:
|
|
||||||
|
|
||||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
|
||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0``
|
|
||||||
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Segment token indices to indicate first and second portions of the inputs.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
|
||||||
corresponds to a `sentence B` token
|
|
||||||
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Mask to avoid performing attention on padding token indices.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
|
||||||
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||||
Labels for computing the multiple choice classification loss.
|
Labels for computing the multiple choice classification loss.
|
||||||
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
|
||||||
@@ -1069,18 +954,23 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
self.classifier = nn.Linear(config.hidden_size, 1)
|
self.classifier = nn.Linear(config.hidden_size, 1)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None):
|
position_ids=None, head_mask=None, labels=None):
|
||||||
num_choices = input_ids.shape[1]
|
num_choices = input_ids.shape[1]
|
||||||
|
|
||||||
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
|
input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||||
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
|
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
|
||||||
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
|
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
|
||||||
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
|
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
|
||||||
outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
|
|
||||||
attention_mask=flat_attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
pooled_output = self.dropout(pooled_output)
|
pooled_output = self.dropout(pooled_output)
|
||||||
@@ -1137,12 +1027,17 @@ class BertForTokenClassification(BertPreTrainedModel):
|
|||||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
||||||
|
position_ids=None, head_mask=None, labels=None):
|
||||||
|
|
||||||
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
|
||||||
position_ids=None, head_mask=None):
|
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
sequence_output = self.dropout(sequence_output)
|
sequence_output = self.dropout(sequence_output)
|
||||||
@@ -1211,12 +1106,17 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
|||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
|
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
|
start_positions=None, end_positions=None):
|
||||||
|
|
||||||
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
|
|
||||||
end_positions=None, position_ids=None, head_mask=None):
|
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
logits = self.qa_outputs(sequence_output)
|
logits = self.qa_outputs(sequence_output)
|
||||||
|
|||||||
@@ -31,7 +31,9 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
from pytorch_transformers.modeling_utils import PretrainedConfig, PreTrainedModel, add_start_docstrings, prune_linear_layer
|
from .modeling_utils import PreTrainedModel, prune_linear_layer
|
||||||
|
from .configuration_distilbert import DistilBertConfig
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -42,69 +44,6 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
|
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
|
||||||
}
|
}
|
||||||
|
|
||||||
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
||||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
|
||||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class DistilBertConfig(PretrainedConfig):
|
|
||||||
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
vocab_size_or_config_json_file=30522,
|
|
||||||
max_position_embeddings=512,
|
|
||||||
sinusoidal_pos_embds=True,
|
|
||||||
n_layers=6,
|
|
||||||
n_heads=12,
|
|
||||||
dim=768,
|
|
||||||
hidden_dim=4*768,
|
|
||||||
dropout=0.1,
|
|
||||||
attention_dropout=0.1,
|
|
||||||
activation='gelu',
|
|
||||||
initializer_range=0.02,
|
|
||||||
tie_weights_=True,
|
|
||||||
qa_dropout=0.1,
|
|
||||||
seq_classif_dropout=0.2,
|
|
||||||
**kwargs):
|
|
||||||
super(DistilBertConfig, self).__init__(**kwargs)
|
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.sinusoidal_pos_embds = sinusoidal_pos_embds
|
|
||||||
self.n_layers = n_layers
|
|
||||||
self.n_heads = n_heads
|
|
||||||
self.dim = dim
|
|
||||||
self.hidden_dim = hidden_dim
|
|
||||||
self.dropout = dropout
|
|
||||||
self.attention_dropout = attention_dropout
|
|
||||||
self.activation = activation
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.tie_weights_ = tie_weights_
|
|
||||||
self.qa_dropout = qa_dropout
|
|
||||||
self.seq_classif_dropout = seq_classif_dropout
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
@property
|
|
||||||
def hidden_size(self):
|
|
||||||
return self.dim
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_attention_heads(self):
|
|
||||||
return self.n_heads
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_hidden_layers(self):
|
|
||||||
return self.n_layers
|
|
||||||
|
|
||||||
|
|
||||||
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
|
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
|
||||||
def gelu(x):
|
def gelu(x):
|
||||||
@@ -174,12 +113,16 @@ class MultiHeadSelfAttention(nn.Module):
|
|||||||
self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
|
self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
|
||||||
self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
|
self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
|
||||||
|
|
||||||
|
self.pruned_heads = set()
|
||||||
|
|
||||||
def prune_heads(self, heads):
|
def prune_heads(self, heads):
|
||||||
attention_head_size = self.dim // self.n_heads
|
attention_head_size = self.dim // self.n_heads
|
||||||
if len(heads) == 0:
|
if len(heads) == 0:
|
||||||
return
|
return
|
||||||
mask = torch.ones(self.n_heads, attention_head_size)
|
mask = torch.ones(self.n_heads, attention_head_size)
|
||||||
|
heads = set(heads) - self.pruned_heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
|
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||||
mask[head] = 0
|
mask[head] = 0
|
||||||
mask = mask.view(-1).contiguous().eq(1)
|
mask = mask.view(-1).contiguous().eq(1)
|
||||||
index = torch.arange(len(mask))[mask].long()
|
index = torch.arange(len(mask))[mask].long()
|
||||||
@@ -191,6 +134,7 @@ class MultiHeadSelfAttention(nn.Module):
|
|||||||
# Update hyper params
|
# Update hyper params
|
||||||
self.n_heads = self.n_heads - len(heads)
|
self.n_heads = self.n_heads - len(heads)
|
||||||
self.dim = attention_head_size * self.n_heads
|
self.dim = attention_head_size * self.n_heads
|
||||||
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def forward(self, query, key, value, mask, head_mask = None):
|
def forward(self, query, key, value, mask, head_mask = None):
|
||||||
"""
|
"""
|
||||||
@@ -395,7 +339,7 @@ class DistilBertPreTrainedModel(PreTrainedModel):
|
|||||||
def __init__(self, *inputs, **kwargs):
|
def __init__(self, *inputs, **kwargs):
|
||||||
super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs)
|
super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||||
|
|
||||||
def init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
""" Initialize the weights.
|
""" Initialize the weights.
|
||||||
"""
|
"""
|
||||||
if isinstance(module, nn.Embedding):
|
if isinstance(module, nn.Embedding):
|
||||||
@@ -480,7 +424,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
|
|||||||
self.embeddings = Embeddings(config) # Embeddings
|
self.embeddings = Embeddings(config) # Embeddings
|
||||||
self.transformer = Transformer(config) # Encoder
|
self.transformer = Transformer(config) # Encoder
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
old_embeddings = self.embeddings.word_embeddings
|
old_embeddings = self.embeddings.word_embeddings
|
||||||
@@ -568,7 +512,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
|||||||
self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
|
self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
|
||||||
self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
|
self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
||||||
@@ -580,10 +524,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.vocab_projector,
|
self._tie_or_clone_weights(self.vocab_projector,
|
||||||
self.distilbert.embeddings.word_embeddings)
|
self.distilbert.embeddings.word_embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None):
|
||||||
dlbrt_output = self.distilbert(input_ids=input_ids,
|
dlbrt_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
|
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
|
||||||
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
|
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
|
||||||
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
|
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
|
||||||
@@ -642,12 +586,12 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
|
|||||||
self.classifier = nn.Linear(config.dim, config.num_labels)
|
self.classifier = nn.Linear(config.dim, config.num_labels)
|
||||||
self.dropout = nn.Dropout(config.seq_classif_dropout)
|
self.dropout = nn.Dropout(config.seq_classif_dropout)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
|
||||||
distilbert_output = self.distilbert(input_ids=input_ids,
|
distilbert_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
|
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
|
||||||
pooled_output = hidden_state[:, 0] # (bs, dim)
|
pooled_output = hidden_state[:, 0] # (bs, dim)
|
||||||
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
|
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
|
||||||
@@ -716,12 +660,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
|||||||
assert config.num_labels == 2
|
assert config.num_labels == 2
|
||||||
self.dropout = nn.Dropout(config.qa_dropout)
|
self.dropout = nn.Dropout(config.qa_dropout)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None):
|
||||||
distilbert_output = self.distilbert(input_ids=input_ids,
|
distilbert_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
|
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
|
||||||
|
|
||||||
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
|
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
|
||||||
|
|||||||
@@ -30,19 +30,15 @@ import torch.nn as nn
|
|||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
|
|
||||||
from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
|
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
|
||||||
PreTrainedModel, prune_conv1d_layer, SequenceSummary,
|
from .configuration_gpt2 import GPT2Config
|
||||||
add_start_docstrings)
|
from .file_utils import add_start_docstrings
|
||||||
from .modeling_bert import BertLayerNorm as LayerNorm
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
|
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
|
||||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
|
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
|
||||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
|
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
|
||||||
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
|
|
||||||
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
|
|
||||||
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
|
|
||||||
|
|
||||||
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
|
||||||
""" Load tf checkpoints in a pytorch model
|
""" Load tf checkpoints in a pytorch model
|
||||||
@@ -102,120 +98,6 @@ def gelu(x):
|
|||||||
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||||
|
|
||||||
|
|
||||||
class GPT2Config(PretrainedConfig):
|
|
||||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
|
||||||
n_positions: Number of positional embeddings.
|
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
|
||||||
resid_pdrop: The dropout probability for all fully connected
|
|
||||||
layers in the embeddings, encoder, and pooler.
|
|
||||||
attn_pdrop: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
|
||||||
initializer_range: The stddev of the truncated_normal_initializer for
|
|
||||||
initializing all weight matrices.
|
|
||||||
"""
|
|
||||||
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
vocab_size_or_config_json_file=50257,
|
|
||||||
n_positions=1024,
|
|
||||||
n_ctx=1024,
|
|
||||||
n_embd=768,
|
|
||||||
n_layer=12,
|
|
||||||
n_head=12,
|
|
||||||
resid_pdrop=0.1,
|
|
||||||
embd_pdrop=0.1,
|
|
||||||
attn_pdrop=0.1,
|
|
||||||
layer_norm_epsilon=1e-5,
|
|
||||||
initializer_range=0.02,
|
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
|
||||||
summary_use_proj=True,
|
|
||||||
summary_activation=None,
|
|
||||||
summary_proj_to_labels=True,
|
|
||||||
summary_first_dropout=0.1,
|
|
||||||
**kwargs
|
|
||||||
):
|
|
||||||
"""Constructs GPT2Config.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
|
||||||
n_positions: Number of positional embeddings.
|
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
|
||||||
resid_pdrop: The dropout probability for all fully connected
|
|
||||||
layers in the embeddings, encoder, and pooler.
|
|
||||||
attn_pdrop: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
|
||||||
initializer_range: The stddev of the truncated_normal_initializer for
|
|
||||||
initializing all weight matrices.
|
|
||||||
"""
|
|
||||||
super(GPT2Config, self).__init__(**kwargs)
|
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
|
||||||
self.n_ctx = n_ctx
|
|
||||||
self.n_positions = n_positions
|
|
||||||
self.n_embd = n_embd
|
|
||||||
self.n_layer = n_layer
|
|
||||||
self.n_head = n_head
|
|
||||||
self.resid_pdrop = resid_pdrop
|
|
||||||
self.embd_pdrop = embd_pdrop
|
|
||||||
self.attn_pdrop = attn_pdrop
|
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def max_position_embeddings(self):
|
|
||||||
return self.n_positions
|
|
||||||
|
|
||||||
@property
|
|
||||||
def hidden_size(self):
|
|
||||||
return self.n_embd
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_attention_heads(self):
|
|
||||||
return self.n_head
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_hidden_layers(self):
|
|
||||||
return self.n_layer
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Attention(nn.Module):
|
class Attention(nn.Module):
|
||||||
def __init__(self, nx, n_ctx, config, scale=False):
|
def __init__(self, nx, n_ctx, config, scale=False):
|
||||||
super(Attention, self).__init__()
|
super(Attention, self).__init__()
|
||||||
@@ -233,24 +115,31 @@ class Attention(nn.Module):
|
|||||||
self.c_proj = Conv1D(n_state, nx)
|
self.c_proj = Conv1D(n_state, nx)
|
||||||
self.attn_dropout = nn.Dropout(config.attn_pdrop)
|
self.attn_dropout = nn.Dropout(config.attn_pdrop)
|
||||||
self.resid_dropout = nn.Dropout(config.resid_pdrop)
|
self.resid_dropout = nn.Dropout(config.resid_pdrop)
|
||||||
|
self.pruned_heads = set()
|
||||||
|
|
||||||
def prune_heads(self, heads):
|
def prune_heads(self, heads):
|
||||||
if len(heads) == 0:
|
if len(heads) == 0:
|
||||||
return
|
return
|
||||||
mask = torch.ones(self.n_head, self.split_size // self.n_head)
|
mask = torch.ones(self.n_head, self.split_size // self.n_head)
|
||||||
|
heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
|
# Compute how many pruned heads are before the head and move the index accordingly
|
||||||
|
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||||
mask[head] = 0
|
mask[head] = 0
|
||||||
mask = mask.view(-1).contiguous().eq(1)
|
mask = mask.view(-1).contiguous().eq(1)
|
||||||
index = torch.arange(len(mask))[mask].long()
|
index = torch.arange(len(mask))[mask].long()
|
||||||
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
|
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
|
||||||
|
|
||||||
# Prune conv1d layers
|
# Prune conv1d layers
|
||||||
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
|
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
|
||||||
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
|
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
|
||||||
|
|
||||||
# Update hyper params
|
# Update hyper params
|
||||||
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
|
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
|
||||||
self.n_head = self.n_head - len(heads)
|
self.n_head = self.n_head - len(heads)
|
||||||
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def _attn(self, q, k, v, head_mask=None):
|
def _attn(self, q, k, v, attention_mask=None, head_mask=None):
|
||||||
w = torch.matmul(q, k)
|
w = torch.matmul(q, k)
|
||||||
if self.scale:
|
if self.scale:
|
||||||
w = w / math.sqrt(v.size(-1))
|
w = w / math.sqrt(v.size(-1))
|
||||||
@@ -258,6 +147,10 @@ class Attention(nn.Module):
|
|||||||
b = self.bias[:, :, ns-nd:ns, :ns]
|
b = self.bias[:, :, ns-nd:ns, :ns]
|
||||||
w = w * b - 1e4 * (1 - b)
|
w = w * b - 1e4 * (1 - b)
|
||||||
|
|
||||||
|
if attention_mask is not None:
|
||||||
|
# Apply the attention mask
|
||||||
|
w = w + attention_mask
|
||||||
|
|
||||||
w = nn.Softmax(dim=-1)(w)
|
w = nn.Softmax(dim=-1)(w)
|
||||||
w = self.attn_dropout(w)
|
w = self.attn_dropout(w)
|
||||||
|
|
||||||
@@ -283,7 +176,7 @@ class Attention(nn.Module):
|
|||||||
else:
|
else:
|
||||||
return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
|
return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
|
||||||
|
|
||||||
def forward(self, x, layer_past=None, head_mask=None):
|
def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
|
||||||
x = self.c_attn(x)
|
x = self.c_attn(x)
|
||||||
query, key, value = x.split(self.split_size, dim=2)
|
query, key, value = x.split(self.split_size, dim=2)
|
||||||
query = self.split_heads(query)
|
query = self.split_heads(query)
|
||||||
@@ -295,7 +188,7 @@ class Attention(nn.Module):
|
|||||||
value = torch.cat((past_value, value), dim=-2)
|
value = torch.cat((past_value, value), dim=-2)
|
||||||
present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
|
present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
|
||||||
|
|
||||||
attn_outputs = self._attn(query, key, value, head_mask)
|
attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
|
||||||
a = attn_outputs[0]
|
a = attn_outputs[0]
|
||||||
|
|
||||||
a = self.merge_heads(a)
|
a = self.merge_heads(a)
|
||||||
@@ -325,13 +218,16 @@ class Block(nn.Module):
|
|||||||
def __init__(self, n_ctx, config, scale=False):
|
def __init__(self, n_ctx, config, scale=False):
|
||||||
super(Block, self).__init__()
|
super(Block, self).__init__()
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||||
self.attn = Attention(nx, n_ctx, config, scale)
|
self.attn = Attention(nx, n_ctx, config, scale)
|
||||||
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||||
self.mlp = MLP(4 * nx, config)
|
self.mlp = MLP(4 * nx, config)
|
||||||
|
|
||||||
def forward(self, x, layer_past=None, head_mask=None):
|
def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
|
||||||
output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
|
output_attn = self.attn(self.ln_1(x),
|
||||||
|
layer_past=layer_past,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
head_mask=head_mask)
|
||||||
a = output_attn[0] # output_attn: a, present, (attentions)
|
a = output_attn[0] # output_attn: a, present, (attentions)
|
||||||
|
|
||||||
x = x + a
|
x = x + a
|
||||||
@@ -354,7 +250,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
|
|||||||
def __init__(self, *inputs, **kwargs):
|
def __init__(self, *inputs, **kwargs):
|
||||||
super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
|
super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||||
|
|
||||||
def init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
""" Initialize the weights.
|
""" Initialize the weights.
|
||||||
"""
|
"""
|
||||||
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
||||||
@@ -363,7 +259,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
|
|||||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||||
if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
|
if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, LayerNorm):
|
elif isinstance(module, nn.LayerNorm):
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
module.weight.data.fill_(1.0)
|
module.weight.data.fill_(1.0)
|
||||||
|
|
||||||
@@ -394,20 +290,24 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Indices of input sequence tokens in the vocabulary.
|
Indices of input sequence tokens in the vocabulary.
|
||||||
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||||
the right rather than the left.
|
the right rather than the left.
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
(see `past` output below). Can be used to speed up sequential decoding.
|
(see `past` output below). Can be used to speed up sequential decoding.
|
||||||
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -451,9 +351,9 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
|
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
|
||||||
self.drop = nn.Dropout(config.embd_pdrop)
|
self.drop = nn.Dropout(config.embd_pdrop)
|
||||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||||
self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
|
self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
|
||||||
@@ -466,7 +366,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
|
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
if past is None:
|
if past is None:
|
||||||
past_length = 0
|
past_length = 0
|
||||||
past = [None] * len(self.h)
|
past = [None] * len(self.h)
|
||||||
@@ -476,6 +376,23 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
|
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||||
|
|
||||||
|
# Attention mask.
|
||||||
|
if attention_mask is not None:
|
||||||
|
# We create a 4D attention mask from a 2D tensor mask.
|
||||||
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
|
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||||
|
# this attention mask is simpler than the triangular masking of causal attention
|
||||||
|
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
|
||||||
|
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||||
|
|
||||||
|
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||||
|
# masked positions, this operation will create a tensor which is 0.0 for
|
||||||
|
# positions we want to attend and -10000.0 for masked positions.
|
||||||
|
# Since we are adding it to the raw scores before the softmax, this is
|
||||||
|
# effectively the same as removing these entirely.
|
||||||
|
attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||||
|
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
# 1.0 in head_mask indicate we keep the head
|
# 1.0 in head_mask indicate we keep the head
|
||||||
# attention_probs has shape bsz x n_heads x N x N
|
# attention_probs has shape bsz x n_heads x N x N
|
||||||
@@ -513,7 +430,11 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
if self.output_hidden_states:
|
if self.output_hidden_states:
|
||||||
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
||||||
|
|
||||||
outputs = block(hidden_states, layer_past, head_mask[i])
|
outputs = block(hidden_states,
|
||||||
|
layer_past=layer_past,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
head_mask=head_mask[i])
|
||||||
|
|
||||||
hidden_states, present = outputs[:2]
|
hidden_states, present = outputs[:2]
|
||||||
presents = presents + (present,)
|
presents = presents + (present,)
|
||||||
|
|
||||||
@@ -568,8 +489,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||||
|
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||||
|
|
||||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, labels=input_ids)
|
outputs = model(input_ids, labels=input_ids)
|
||||||
loss, logits = outputs[:2]
|
loss, logits = outputs[:2]
|
||||||
@@ -580,7 +505,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
self.transformer = GPT2Model(config)
|
self.transformer = GPT2Model(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -590,9 +515,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.lm_head,
|
self._tie_or_clone_weights(self.lm_head,
|
||||||
self.transformer.wte)
|
self.transformer.wte)
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
|
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
labels=None):
|
||||||
past=past, head_mask=head_mask)
|
transformer_outputs = self.transformer(input_ids,
|
||||||
|
past=past,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
@@ -615,33 +545,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
||||||
The language modeling head has its weights tied to the input embeddings,
|
The language modeling head has its weights tied to the input embeddings,
|
||||||
the classification head takes as input the input of a specified classification token index in the input sequence).
|
the classification head takes as input the input of a specified classification token index in the input sequence).
|
||||||
""", GPT2_START_DOCSTRING)
|
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
|
||||||
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||||
r""" Inputs:
|
r"""
|
||||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
**mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
|
||||||
Indices of input sequence tokens in the vocabulary.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
|
||||||
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
|
|
||||||
Index of the classification token in each input sequence.
|
Index of the classification token in each input sequence.
|
||||||
Selected in the range ``[0, input_ids.size(-1) - 1]``.
|
Selected in the range ``[0, input_ids.size(-1) - 1]``.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**past**:
|
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
|
||||||
(see `past` output below). Can be used to speed up sequential decoding.
|
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
|
||||||
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
@@ -676,13 +585,25 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
|
||||||
|
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||||
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
||||||
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
|
|
||||||
|
# Add a [CLS] to the vocabulary (we should train it also!)
|
||||||
|
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
|
||||||
|
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
|
||||||
|
print(tokenizer.cls_token_id, len(tokenizer)) # The newly added token is the last token of the vocabulary
|
||||||
|
|
||||||
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
|
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
|
||||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
encoded_choices = [tokenizer.encode(s) for s in choices]
|
||||||
mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1
|
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
|
||||||
outputs = model(input_ids, mc_token_ids)
|
|
||||||
|
input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
|
||||||
|
mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
|
||||||
|
|
||||||
|
outputs = model(input_ids, mc_token_ids=mc_token_ids)
|
||||||
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -692,7 +613,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
self.multiple_choice_head = SequenceSummary(config)
|
self.multiple_choice_head = SequenceSummary(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
""" Make sure we are sharing the input and output embeddings.
|
""" Make sure we are sharing the input and output embeddings.
|
||||||
@@ -701,10 +623,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.lm_head,
|
self._tie_or_clone_weights(self.lm_head,
|
||||||
self.transformer.wte)
|
self.transformer.wte)
|
||||||
|
|
||||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, past=None, head_mask=None):
|
mc_token_ids=None, lm_labels=None, mc_labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
past=past, head_mask=head_mask)
|
past=past,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
|
|||||||
@@ -30,15 +30,13 @@ import torch.nn as nn
|
|||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
|
|
||||||
from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
|
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
|
||||||
PreTrainedModel, prune_conv1d_layer, SequenceSummary,
|
from .configuration_openai import OpenAIGPTConfig
|
||||||
add_start_docstrings)
|
from .file_utils import add_start_docstrings
|
||||||
from .modeling_bert import BertLayerNorm as LayerNorm
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
|
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
|
||||||
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
|
|
||||||
|
|
||||||
|
|
||||||
def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
|
def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
|
||||||
@@ -127,111 +125,6 @@ def swish(x):
|
|||||||
ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
|
ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
|
||||||
|
|
||||||
|
|
||||||
class OpenAIGPTConfig(PretrainedConfig):
|
|
||||||
"""
|
|
||||||
Configuration class to store the configuration of a `OpenAIGPTModel`.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
|
||||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
|
|
||||||
n_positions: Number of positional embeddings.
|
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
afn: The non-linear activation function (function or string) in the
|
|
||||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
|
||||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
|
||||||
layers in the embeddings, encoder, and pooler.
|
|
||||||
attn_pdrop: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
|
||||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
|
||||||
initializing all weight matrices.
|
|
||||||
predict_special_tokens: should we predict special tokens (when the model has a LM head)
|
|
||||||
"""
|
|
||||||
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
vocab_size_or_config_json_file=40478,
|
|
||||||
n_positions=512,
|
|
||||||
n_ctx=512,
|
|
||||||
n_embd=768,
|
|
||||||
n_layer=12,
|
|
||||||
n_head=12,
|
|
||||||
afn="gelu",
|
|
||||||
resid_pdrop=0.1,
|
|
||||||
embd_pdrop=0.1,
|
|
||||||
attn_pdrop=0.1,
|
|
||||||
layer_norm_epsilon=1e-5,
|
|
||||||
initializer_range=0.02,
|
|
||||||
predict_special_tokens=True,
|
|
||||||
|
|
||||||
num_labels=1,
|
|
||||||
summary_type='cls_index',
|
|
||||||
summary_use_proj=True,
|
|
||||||
summary_activation=None,
|
|
||||||
summary_proj_to_labels=True,
|
|
||||||
summary_first_dropout=0.1,
|
|
||||||
**kwargs
|
|
||||||
):
|
|
||||||
"""Constructs OpenAIGPTConfig.
|
|
||||||
"""
|
|
||||||
super(OpenAIGPTConfig, self).__init__(**kwargs)
|
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
|
||||||
self.n_ctx = n_ctx
|
|
||||||
self.n_positions = n_positions
|
|
||||||
self.n_embd = n_embd
|
|
||||||
self.n_layer = n_layer
|
|
||||||
self.n_head = n_head
|
|
||||||
self.afn = afn
|
|
||||||
self.resid_pdrop = resid_pdrop
|
|
||||||
self.embd_pdrop = embd_pdrop
|
|
||||||
self.attn_pdrop = attn_pdrop
|
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.predict_special_tokens = predict_special_tokens
|
|
||||||
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"First argument must be either a vocabulary size (int)"
|
|
||||||
"or the path to a pretrained model config file (str)"
|
|
||||||
)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def max_position_embeddings(self):
|
|
||||||
return self.n_positions
|
|
||||||
|
|
||||||
@property
|
|
||||||
def hidden_size(self):
|
|
||||||
return self.n_embd
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_attention_heads(self):
|
|
||||||
return self.n_head
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_hidden_layers(self):
|
|
||||||
return self.n_layer
|
|
||||||
|
|
||||||
|
|
||||||
class Attention(nn.Module):
|
class Attention(nn.Module):
|
||||||
def __init__(self, nx, n_ctx, config, scale=False):
|
def __init__(self, nx, n_ctx, config, scale=False):
|
||||||
super(Attention, self).__init__()
|
super(Attention, self).__init__()
|
||||||
@@ -249,12 +142,15 @@ class Attention(nn.Module):
|
|||||||
self.c_proj = Conv1D(n_state, nx)
|
self.c_proj = Conv1D(n_state, nx)
|
||||||
self.attn_dropout = nn.Dropout(config.attn_pdrop)
|
self.attn_dropout = nn.Dropout(config.attn_pdrop)
|
||||||
self.resid_dropout = nn.Dropout(config.resid_pdrop)
|
self.resid_dropout = nn.Dropout(config.resid_pdrop)
|
||||||
|
self.pruned_heads = set()
|
||||||
|
|
||||||
def prune_heads(self, heads):
|
def prune_heads(self, heads):
|
||||||
if len(heads) == 0:
|
if len(heads) == 0:
|
||||||
return
|
return
|
||||||
mask = torch.ones(self.n_head, self.split_size // self.n_head)
|
mask = torch.ones(self.n_head, self.split_size // self.n_head)
|
||||||
|
heads = set(heads) - self.pruned_heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
|
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||||
mask[head] = 0
|
mask[head] = 0
|
||||||
mask = mask.view(-1).contiguous().eq(1)
|
mask = mask.view(-1).contiguous().eq(1)
|
||||||
index = torch.arange(len(mask))[mask].long()
|
index = torch.arange(len(mask))[mask].long()
|
||||||
@@ -265,8 +161,9 @@ class Attention(nn.Module):
|
|||||||
# Update hyper params
|
# Update hyper params
|
||||||
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
|
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
|
||||||
self.n_head = self.n_head - len(heads)
|
self.n_head = self.n_head - len(heads)
|
||||||
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def _attn(self, q, k, v, head_mask=None):
|
def _attn(self, q, k, v, attention_mask=None, head_mask=None):
|
||||||
w = torch.matmul(q, k)
|
w = torch.matmul(q, k)
|
||||||
if self.scale:
|
if self.scale:
|
||||||
w = w / math.sqrt(v.size(-1))
|
w = w / math.sqrt(v.size(-1))
|
||||||
@@ -275,6 +172,10 @@ class Attention(nn.Module):
|
|||||||
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
|
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
|
||||||
w = w * b + -1e9 * (1 - b)
|
w = w * b + -1e9 * (1 - b)
|
||||||
|
|
||||||
|
if attention_mask is not None:
|
||||||
|
# Apply the attention mask
|
||||||
|
w = w + attention_mask
|
||||||
|
|
||||||
w = nn.Softmax(dim=-1)(w)
|
w = nn.Softmax(dim=-1)(w)
|
||||||
w = self.attn_dropout(w)
|
w = self.attn_dropout(w)
|
||||||
|
|
||||||
@@ -300,14 +201,14 @@ class Attention(nn.Module):
|
|||||||
else:
|
else:
|
||||||
return x.permute(0, 2, 1, 3)
|
return x.permute(0, 2, 1, 3)
|
||||||
|
|
||||||
def forward(self, x, head_mask=None):
|
def forward(self, x, attention_mask=None, head_mask=None):
|
||||||
x = self.c_attn(x)
|
x = self.c_attn(x)
|
||||||
query, key, value = x.split(self.split_size, dim=2)
|
query, key, value = x.split(self.split_size, dim=2)
|
||||||
query = self.split_heads(query)
|
query = self.split_heads(query)
|
||||||
key = self.split_heads(key, k=True)
|
key = self.split_heads(key, k=True)
|
||||||
value = self.split_heads(value)
|
value = self.split_heads(value)
|
||||||
|
|
||||||
attn_outputs = self._attn(query, key, value, head_mask)
|
attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
|
||||||
a = attn_outputs[0]
|
a = attn_outputs[0]
|
||||||
|
|
||||||
a = self.merge_heads(a)
|
a = self.merge_heads(a)
|
||||||
@@ -338,12 +239,12 @@ class Block(nn.Module):
|
|||||||
super(Block, self).__init__()
|
super(Block, self).__init__()
|
||||||
nx = config.n_embd
|
nx = config.n_embd
|
||||||
self.attn = Attention(nx, n_ctx, config, scale)
|
self.attn = Attention(nx, n_ctx, config, scale)
|
||||||
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||||
self.mlp = MLP(4 * nx, config)
|
self.mlp = MLP(4 * nx, config)
|
||||||
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||||
|
|
||||||
def forward(self, x, head_mask=None):
|
def forward(self, x, attention_mask=None, head_mask=None):
|
||||||
attn_outputs = self.attn(x, head_mask=head_mask)
|
attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask)
|
||||||
a = attn_outputs[0]
|
a = attn_outputs[0]
|
||||||
|
|
||||||
n = self.ln_1(x + a)
|
n = self.ln_1(x + a)
|
||||||
@@ -363,10 +264,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
|||||||
load_tf_weights = load_tf_weights_in_openai_gpt
|
load_tf_weights = load_tf_weights_in_openai_gpt
|
||||||
base_model_prefix = "transformer"
|
base_model_prefix = "transformer"
|
||||||
|
|
||||||
def __init__(self, *inputs, **kwargs):
|
def _init_weights(self, module):
|
||||||
super(OpenAIGPTPreTrainedModel, self).__init__(*inputs, **kwargs)
|
|
||||||
|
|
||||||
def init_weights(self, module):
|
|
||||||
""" Initialize the weights.
|
""" Initialize the weights.
|
||||||
"""
|
"""
|
||||||
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
|
||||||
@@ -375,7 +273,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
|||||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||||
if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
|
if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
elif isinstance(module, LayerNorm):
|
elif isinstance(module, nn.LayerNorm):
|
||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
module.weight.data.fill_(1.0)
|
module.weight.data.fill_(1.0)
|
||||||
|
|
||||||
@@ -409,17 +307,17 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -460,7 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
self.drop = nn.Dropout(config.embd_pdrop)
|
self.drop = nn.Dropout(config.embd_pdrop)
|
||||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
|
self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
|
||||||
@@ -473,7 +371,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
# This was used when we had a single embedding matrix for position and token embeddings
|
# This was used when we had a single embedding matrix for position and token embeddings
|
||||||
# start = self.config.vocab_size + self.config.n_special
|
# start = self.config.vocab_size + self.config.n_special
|
||||||
@@ -482,6 +380,23 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
|
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||||
|
|
||||||
|
# Attention mask.
|
||||||
|
if attention_mask is not None:
|
||||||
|
# We create a 4D attention mask from a 2D tensor mask.
|
||||||
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
|
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||||
|
# this attention mask is simpler than the triangular masking of causal attention
|
||||||
|
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
|
||||||
|
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||||
|
|
||||||
|
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||||
|
# masked positions, this operation will create a tensor which is 0.0 for
|
||||||
|
# positions we want to attend and -10000.0 for masked positions.
|
||||||
|
# Since we are adding it to the raw scores before the softmax, this is
|
||||||
|
# effectively the same as removing these entirely.
|
||||||
|
attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||||
|
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
# 1.0 in head_mask indicate we keep the head
|
# 1.0 in head_mask indicate we keep the head
|
||||||
# attention_probs has shape bsz x n_heads x N x N
|
# attention_probs has shape bsz x n_heads x N x N
|
||||||
@@ -518,7 +433,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
if self.output_hidden_states:
|
if self.output_hidden_states:
|
||||||
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
||||||
|
|
||||||
outputs = block(hidden_states, head_mask[i])
|
outputs = block(hidden_states, attention_mask, head_mask[i])
|
||||||
hidden_states = outputs[0]
|
hidden_states = outputs[0]
|
||||||
if self.output_attentions:
|
if self.output_attentions:
|
||||||
all_attentions = all_attentions + (outputs[1],)
|
all_attentions = all_attentions + (outputs[1],)
|
||||||
@@ -573,7 +488,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
self.transformer = OpenAIGPTModel(config)
|
self.transformer = OpenAIGPTModel(config)
|
||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -583,8 +498,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.lm_head,
|
self._tie_or_clone_weights(self.lm_head,
|
||||||
self.transformer.tokens_embed)
|
self.transformer.tokens_embed)
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
labels=None):
|
||||||
|
transformer_outputs = self.transformer(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
@@ -607,40 +526,19 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
||||||
The language modeling head has its weights tied to the input embeddings,
|
The language modeling head has its weights tied to the input embeddings,
|
||||||
the classification head takes as input the input of a specified classification token index in the input sequence).
|
the classification head takes as input the input of a specified classification token index in the input sequence).
|
||||||
""", OPENAI_GPT_START_DOCSTRING)
|
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
|
||||||
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||||
r""" Inputs:
|
r"""
|
||||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
**mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
|
||||||
Indices of input sequence tokens in the vocabulary.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
|
||||||
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
|
|
||||||
Index of the classification token in each input sequence.
|
Index of the classification token in each input sequence.
|
||||||
Selected in the range ``[0, input_ids.size(-1) - 1]``.
|
Selected in the range ``[0, input_ids.size(-1) - 1]``.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Mask to avoid performing attention on padding token indices.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
|
||||||
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-1`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
**multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
||||||
Labels for computing the multiple choice classification loss.
|
Labels for computing the multiple choice classification loss.
|
||||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||||
of the input tensors. (see `input_ids` above)
|
of the input tensors. (see `input_ids` above)
|
||||||
@@ -673,7 +571,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
|
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
|
||||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||||
mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1
|
mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1
|
||||||
outputs = model(input_ids, mc_token_ids)
|
outputs = model(input_ids, mc_token_ids=mc_token_ids)
|
||||||
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -684,7 +582,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
self.multiple_choice_head = SequenceSummary(config)
|
self.multiple_choice_head = SequenceSummary(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -694,9 +592,12 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.lm_head,
|
self._tie_or_clone_weights(self.lm_head,
|
||||||
self.transformer.tokens_embed)
|
self.transformer.tokens_embed)
|
||||||
|
|
||||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, head_mask=None):
|
mc_token_ids=None, lm_labels=None, mc_labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
|
|||||||
@@ -22,14 +22,11 @@ import logging
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch.nn.functional as F
|
|
||||||
from torch.nn import CrossEntropyLoss, MSELoss
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
|
||||||
from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings,
|
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
|
||||||
BertLayerNorm, BertModel,
|
from .configuration_roberta import RobertaConfig
|
||||||
BertPreTrainedModel, gelu)
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
from pytorch_transformers.modeling_utils import add_start_docstrings
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -39,13 +36,6 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
|
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
|
||||||
}
|
}
|
||||||
|
|
||||||
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
||||||
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
|
|
||||||
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
|
|
||||||
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class RobertaEmbeddings(BertEmbeddings):
|
class RobertaEmbeddings(BertEmbeddings):
|
||||||
"""
|
"""
|
||||||
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
|
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
|
||||||
@@ -61,11 +51,9 @@ class RobertaEmbeddings(BertEmbeddings):
|
|||||||
# cf. fairseq's `utils.make_positions`
|
# cf. fairseq's `utils.make_positions`
|
||||||
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
|
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||||
return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
|
return super(RobertaEmbeddings, self).forward(input_ids,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids)
|
||||||
class RobertaConfig(BertConfig):
|
|
||||||
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
|
|
||||||
|
|
||||||
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
||||||
@@ -116,13 +104,20 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1[``.
|
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Optional segment token indices to indicate first and second portions of the inputs.
|
||||||
|
This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it
|
||||||
|
during finetuning.
|
||||||
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
|
corresponds to a `sentence B` token
|
||||||
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1[``.
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -168,14 +163,18 @@ class RobertaModel(BertModel):
|
|||||||
super(RobertaModel, self).__init__(config)
|
super(RobertaModel, self).__init__(config)
|
||||||
|
|
||||||
self.embeddings = RobertaEmbeddings(config)
|
self.embeddings = RobertaEmbeddings(config)
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
if input_ids[:, 0].sum().item() != 0:
|
if input_ids[:, 0].sum().item() != 0:
|
||||||
logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
|
logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
|
||||||
"This model requires special tokens in order to work. "
|
"This model requires special tokens in order to work. "
|
||||||
"Please specify add_special_tokens=True in your encoding.")
|
"Please specify add_special_tokens=True in your encoding.")
|
||||||
return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask)
|
return super(RobertaModel, self).forward(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
|
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
|
||||||
@@ -220,7 +219,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
|||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
self.lm_head = RobertaLMHead(config)
|
self.lm_head = RobertaLMHead(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -229,10 +228,13 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
|
self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
head_mask=None):
|
masked_lm_labels=None):
|
||||||
outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
outputs = self.roberta(input_ids,
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
prediction_scores = self.lm_head(sequence_output)
|
prediction_scores = self.lm_head(sequence_output)
|
||||||
|
|
||||||
@@ -313,10 +315,13 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
|||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
self.classifier = RobertaClassificationHead(config)
|
self.classifier = RobertaClassificationHead(config)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, head_mask=None):
|
labels=None):
|
||||||
outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
outputs = self.roberta(input_ids,
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
logits = self.classifier(sequence_output)
|
logits = self.classifier(sequence_output)
|
||||||
|
|
||||||
|
|||||||
@@ -34,18 +34,16 @@ import torch.nn.functional as F
|
|||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
|
|
||||||
from .modeling_bert import BertLayerNorm as LayerNorm
|
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
|
||||||
|
from .configuration_transfo_xl import TransfoXLConfig
|
||||||
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
|
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
|
||||||
from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings)
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||||
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
|
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
|
||||||
}
|
}
|
||||||
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
||||||
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
|
|
||||||
}
|
|
||||||
|
|
||||||
def build_tf_to_pytorch_map(model, config):
|
def build_tf_to_pytorch_map(model, config):
|
||||||
""" A map of modules from TF to PyTorch.
|
""" A map of modules from TF to PyTorch.
|
||||||
@@ -175,143 +173,6 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
class TransfoXLConfig(PretrainedConfig):
|
|
||||||
"""Configuration class to store the configuration of a `TransfoXLModel`.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
|
|
||||||
cutoffs: cutoffs for the adaptive softmax
|
|
||||||
d_model: Dimensionality of the model's hidden states.
|
|
||||||
d_embed: Dimensionality of the embeddings
|
|
||||||
d_head: Dimensionality of the model's heads.
|
|
||||||
div_val: divident value for adapative input and softmax
|
|
||||||
pre_lnorm: apply LayerNorm to the input instead of the output
|
|
||||||
d_inner: Inner dimension in FF
|
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
tgt_len: number of tokens to predict
|
|
||||||
ext_len: length of the extended context
|
|
||||||
mem_len: length of the retained previous heads
|
|
||||||
same_length: use the same attn length for all tokens
|
|
||||||
proj_share_all_but_first: True to share all but first projs, False not to share.
|
|
||||||
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
|
|
||||||
clamp_len: use the same pos embeddings after clamp_len
|
|
||||||
sample_softmax: number of samples in sampled softmax
|
|
||||||
adaptive: use adaptive softmax
|
|
||||||
tie_weight: tie the word embedding and softmax weights
|
|
||||||
dropout: The dropout probabilitiy for all fully connected
|
|
||||||
layers in the embeddings, encoder, and pooler.
|
|
||||||
dropatt: The dropout ratio for the attention probabilities.
|
|
||||||
untie_r: untie relative position biases
|
|
||||||
embd_pdrop: The dropout ratio for the embeddings.
|
|
||||||
init: parameter initializer to use
|
|
||||||
init_range: parameters initialized by U(-init_range, init_range).
|
|
||||||
proj_init_std: parameters initialized by N(0, init_std)
|
|
||||||
init_std: parameters initialized by N(0, init_std)
|
|
||||||
"""
|
|
||||||
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
vocab_size_or_config_json_file=267735,
|
|
||||||
cutoffs=[20000, 40000, 200000],
|
|
||||||
d_model=1024,
|
|
||||||
d_embed=1024,
|
|
||||||
n_head=16,
|
|
||||||
d_head=64,
|
|
||||||
d_inner=4096,
|
|
||||||
div_val=4,
|
|
||||||
pre_lnorm=False,
|
|
||||||
n_layer=18,
|
|
||||||
tgt_len=128,
|
|
||||||
ext_len=0,
|
|
||||||
mem_len=1600,
|
|
||||||
clamp_len=1000,
|
|
||||||
same_length=True,
|
|
||||||
proj_share_all_but_first=True,
|
|
||||||
attn_type=0,
|
|
||||||
sample_softmax=-1,
|
|
||||||
adaptive=True,
|
|
||||||
tie_weight=True,
|
|
||||||
dropout=0.1,
|
|
||||||
dropatt=0.0,
|
|
||||||
untie_r=True,
|
|
||||||
init="normal",
|
|
||||||
init_range=0.01,
|
|
||||||
proj_init_std=0.01,
|
|
||||||
init_std=0.02,
|
|
||||||
**kwargs):
|
|
||||||
"""Constructs TransfoXLConfig.
|
|
||||||
"""
|
|
||||||
super(TransfoXLConfig, self).__init__(**kwargs)
|
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.n_token = vocab_size_or_config_json_file
|
|
||||||
self.cutoffs = []
|
|
||||||
self.cutoffs.extend(cutoffs)
|
|
||||||
self.tie_weight = tie_weight
|
|
||||||
if proj_share_all_but_first:
|
|
||||||
self.tie_projs = [False] + [True] * len(self.cutoffs)
|
|
||||||
else:
|
|
||||||
self.tie_projs = [False] + [False] * len(self.cutoffs)
|
|
||||||
self.d_model = d_model
|
|
||||||
self.d_embed = d_embed
|
|
||||||
self.d_head = d_head
|
|
||||||
self.d_inner = d_inner
|
|
||||||
self.div_val = div_val
|
|
||||||
self.pre_lnorm = pre_lnorm
|
|
||||||
self.n_layer = n_layer
|
|
||||||
self.n_head = n_head
|
|
||||||
self.tgt_len = tgt_len
|
|
||||||
self.ext_len = ext_len
|
|
||||||
self.mem_len = mem_len
|
|
||||||
self.same_length = same_length
|
|
||||||
self.attn_type = attn_type
|
|
||||||
self.clamp_len = clamp_len
|
|
||||||
self.sample_softmax = sample_softmax
|
|
||||||
self.adaptive = adaptive
|
|
||||||
self.dropout = dropout
|
|
||||||
self.dropatt = dropatt
|
|
||||||
self.untie_r = untie_r
|
|
||||||
self.init = init
|
|
||||||
self.init_range = init_range
|
|
||||||
self.proj_init_std = proj_init_std
|
|
||||||
self.init_std = init_std
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def max_position_embeddings(self):
|
|
||||||
return self.tgt_len + self.ext_len + self.mem_len
|
|
||||||
|
|
||||||
@property
|
|
||||||
def vocab_size(self):
|
|
||||||
return self.n_token
|
|
||||||
|
|
||||||
@vocab_size.setter
|
|
||||||
def vocab_size(self, value):
|
|
||||||
self.n_token = value
|
|
||||||
|
|
||||||
@property
|
|
||||||
def hidden_size(self):
|
|
||||||
return self.d_model
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_attention_heads(self):
|
|
||||||
return self.n_head
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_hidden_layers(self):
|
|
||||||
return self.n_layer
|
|
||||||
|
|
||||||
|
|
||||||
class PositionalEmbedding(nn.Module):
|
class PositionalEmbedding(nn.Module):
|
||||||
def __init__(self, demb):
|
def __init__(self, demb):
|
||||||
super(PositionalEmbedding, self).__init__()
|
super(PositionalEmbedding, self).__init__()
|
||||||
@@ -347,7 +208,7 @@ class PositionwiseFF(nn.Module):
|
|||||||
nn.Dropout(dropout),
|
nn.Dropout(dropout),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.layer_norm = LayerNorm(d_model)
|
self.layer_norm = nn.LayerNorm(d_model)
|
||||||
|
|
||||||
self.pre_lnorm = pre_lnorm
|
self.pre_lnorm = pre_lnorm
|
||||||
|
|
||||||
@@ -370,7 +231,7 @@ class PositionwiseFF(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class MultiHeadAttn(nn.Module):
|
class MultiHeadAttn(nn.Module):
|
||||||
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
|
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
|
||||||
pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False):
|
pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False):
|
||||||
super(MultiHeadAttn, self).__init__()
|
super(MultiHeadAttn, self).__init__()
|
||||||
|
|
||||||
@@ -387,7 +248,7 @@ class MultiHeadAttn(nn.Module):
|
|||||||
self.dropatt = nn.Dropout(dropatt)
|
self.dropatt = nn.Dropout(dropatt)
|
||||||
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
|
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
|
||||||
|
|
||||||
self.layer_norm = LayerNorm(d_model)
|
self.layer_norm = nn.LayerNorm(d_model)
|
||||||
|
|
||||||
self.scale = 1 / (d_head ** 0.5)
|
self.scale = 1 / (d_head ** 0.5)
|
||||||
|
|
||||||
@@ -423,7 +284,8 @@ class MultiHeadAttn(nn.Module):
|
|||||||
# [qlen x klen x bsz x n_head]
|
# [qlen x klen x bsz x n_head]
|
||||||
attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k))
|
attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k))
|
||||||
attn_score.mul_(self.scale)
|
attn_score.mul_(self.scale)
|
||||||
if attn_mask is not None and attn_mask.any().item():
|
if attn_mask is not None and torch.sum(attn_mask).item():
|
||||||
|
attn_mask = (attn_mask == 1) # Switch to bool
|
||||||
if attn_mask.dim() == 2:
|
if attn_mask.dim() == 2:
|
||||||
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
|
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
|
||||||
elif attn_mask.dim() == 3:
|
elif attn_mask.dim() == 3:
|
||||||
@@ -476,7 +338,7 @@ class RelMultiHeadAttn(nn.Module):
|
|||||||
self.dropatt = nn.Dropout(dropatt)
|
self.dropatt = nn.Dropout(dropatt)
|
||||||
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
|
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
|
||||||
|
|
||||||
self.layer_norm = LayerNorm(d_model)
|
self.layer_norm = nn.LayerNorm(d_model)
|
||||||
|
|
||||||
self.scale = 1 / (d_head ** 0.5)
|
self.scale = 1 / (d_head ** 0.5)
|
||||||
|
|
||||||
@@ -586,13 +448,22 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
|
|||||||
attn_score.mul_(self.scale)
|
attn_score.mul_(self.scale)
|
||||||
|
|
||||||
#### compute attention probability
|
#### compute attention probability
|
||||||
if attn_mask is not None and attn_mask.any().item():
|
if attn_mask is not None and torch.sum(attn_mask).item():
|
||||||
|
attn_mask = (attn_mask == 1) # Switch to bool
|
||||||
if attn_mask.dim() == 2:
|
if attn_mask.dim() == 2:
|
||||||
attn_score = attn_score.float().masked_fill(
|
if next(self.parameters()).dtype == torch.float16:
|
||||||
attn_mask[None,:,:,None], -1e30).type_as(attn_score)
|
attn_score = attn_score.float().masked_fill(
|
||||||
|
attn_mask[None,:,:,None], -65000).type_as(attn_score)
|
||||||
|
else:
|
||||||
|
attn_score = attn_score.float().masked_fill(
|
||||||
|
attn_mask[None,:,:,None], -1e30).type_as(attn_score)
|
||||||
elif attn_mask.dim() == 3:
|
elif attn_mask.dim() == 3:
|
||||||
attn_score = attn_score.float().masked_fill(
|
if next(self.parameters()).dtype == torch.float16:
|
||||||
attn_mask[:,:,:,None], -1e30).type_as(attn_score)
|
attn_score = attn_score.float().masked_fill(
|
||||||
|
attn_mask[:,:,:,None], -65000).type_as(attn_score)
|
||||||
|
else:
|
||||||
|
attn_score = attn_score.float().masked_fill(
|
||||||
|
attn_mask[:,:,:,None], -1e30).type_as(attn_score)
|
||||||
|
|
||||||
# [qlen x klen x bsz x n_head]
|
# [qlen x klen x bsz x n_head]
|
||||||
attn_prob = F.softmax(attn_score, dim=1)
|
attn_prob = F.softmax(attn_score, dim=1)
|
||||||
@@ -680,7 +551,8 @@ class RelLearnableMultiHeadAttn(RelMultiHeadAttn):
|
|||||||
attn_score.mul_(self.scale)
|
attn_score.mul_(self.scale)
|
||||||
|
|
||||||
#### compute attention probability
|
#### compute attention probability
|
||||||
if attn_mask is not None and attn_mask.any().item():
|
if attn_mask is not None and torch.sum(attn_mask).item():
|
||||||
|
attn_mask = (attn_mask == 1) # Switch to bool
|
||||||
if attn_mask.dim() == 2:
|
if attn_mask.dim() == 2:
|
||||||
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
|
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
|
||||||
elif attn_mask.dim() == 3:
|
elif attn_mask.dim() == 3:
|
||||||
@@ -723,7 +595,7 @@ class DecoderLayer(nn.Module):
|
|||||||
super(DecoderLayer, self).__init__()
|
super(DecoderLayer, self).__init__()
|
||||||
|
|
||||||
self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
|
self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
|
||||||
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
|
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
|
||||||
pre_lnorm=kwargs.get('pre_lnorm'))
|
pre_lnorm=kwargs.get('pre_lnorm'))
|
||||||
|
|
||||||
def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None):
|
def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None):
|
||||||
@@ -743,7 +615,7 @@ class RelLearnableDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout,
|
self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout,
|
||||||
**kwargs)
|
**kwargs)
|
||||||
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
|
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
|
||||||
pre_lnorm=kwargs.get('pre_lnorm'))
|
pre_lnorm=kwargs.get('pre_lnorm'))
|
||||||
|
|
||||||
def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None, head_mask=None):
|
def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None, head_mask=None):
|
||||||
@@ -764,7 +636,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,
|
self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,
|
||||||
d_head, dropout, **kwargs)
|
d_head, dropout, **kwargs)
|
||||||
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
|
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
|
||||||
pre_lnorm=kwargs.get('pre_lnorm'))
|
pre_lnorm=kwargs.get('pre_lnorm'))
|
||||||
|
|
||||||
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
|
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
|
||||||
@@ -781,7 +653,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class AdaptiveEmbedding(nn.Module):
|
class AdaptiveEmbedding(nn.Module):
|
||||||
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
|
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
|
||||||
sample_softmax=False):
|
sample_softmax=False):
|
||||||
super(AdaptiveEmbedding, self).__init__()
|
super(AdaptiveEmbedding, self).__init__()
|
||||||
|
|
||||||
@@ -819,7 +691,7 @@ class AdaptiveEmbedding(nn.Module):
|
|||||||
else:
|
else:
|
||||||
param = next(self.parameters())
|
param = next(self.parameters())
|
||||||
inp_flat = inp.view(-1)
|
inp_flat = inp.view(-1)
|
||||||
emb_flat = torch.zeros([inp_flat.size(0), self.d_proj],
|
emb_flat = torch.zeros([inp_flat.size(0), self.d_proj],
|
||||||
dtype=param.dtype, device=param.device)
|
dtype=param.dtype, device=param.device)
|
||||||
for i in range(len(self.cutoffs)):
|
for i in range(len(self.cutoffs)):
|
||||||
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
|
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
|
||||||
@@ -853,9 +725,6 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
|
|||||||
load_tf_weights = load_tf_weights_in_transfo_xl
|
load_tf_weights = load_tf_weights_in_transfo_xl
|
||||||
base_model_prefix = "transformer"
|
base_model_prefix = "transformer"
|
||||||
|
|
||||||
def __init__(self, *inputs, **kwargs):
|
|
||||||
super(TransfoXLPreTrainedModel, self).__init__(*inputs, **kwargs)
|
|
||||||
|
|
||||||
def _init_weight(self, weight):
|
def _init_weight(self, weight):
|
||||||
if self.config.init == 'uniform':
|
if self.config.init == 'uniform':
|
||||||
nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
|
nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
|
||||||
@@ -865,7 +734,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
|
|||||||
def _init_bias(self, bias):
|
def _init_bias(self, bias):
|
||||||
nn.init.constant_(bias, 0.0)
|
nn.init.constant_(bias, 0.0)
|
||||||
|
|
||||||
def init_weights(self, m):
|
def _init_weights(self, m):
|
||||||
""" Initialize the weights.
|
""" Initialize the weights.
|
||||||
"""
|
"""
|
||||||
classname = m.__class__.__name__
|
classname = m.__class__.__name__
|
||||||
@@ -991,7 +860,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
self.n_head = config.n_head
|
self.n_head = config.n_head
|
||||||
self.d_head = config.d_head
|
self.d_head = config.d_head
|
||||||
|
|
||||||
self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
|
self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
|
||||||
div_val=config.div_val)
|
div_val=config.div_val)
|
||||||
|
|
||||||
self.drop = nn.Dropout(config.dropout)
|
self.drop = nn.Dropout(config.dropout)
|
||||||
@@ -1059,7 +928,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
self.r_emb = nn.Parameter(torch.FloatTensor(
|
self.r_emb = nn.Parameter(torch.FloatTensor(
|
||||||
self.n_layer, self.max_klen, self.n_head, self.d_head))
|
self.n_layer, self.max_klen, self.n_head, self.d_head))
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
return self.word_emb
|
return self.word_emb
|
||||||
@@ -1135,22 +1004,22 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
mlen = mems[0].size(0) if mems is not None else 0
|
mlen = mems[0].size(0) if mems is not None else 0
|
||||||
klen = mlen + qlen
|
klen = mlen + qlen
|
||||||
if self.same_length:
|
if self.same_length:
|
||||||
all_ones = word_emb.new_ones(qlen, klen)
|
all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
|
||||||
mask_len = klen - self.mem_len
|
mask_len = klen - self.mem_len
|
||||||
if mask_len > 0:
|
if mask_len > 0:
|
||||||
mask_shift_len = qlen - mask_len
|
mask_shift_len = qlen - mask_len
|
||||||
else:
|
else:
|
||||||
mask_shift_len = qlen
|
mask_shift_len = qlen
|
||||||
dec_attn_mask = (torch.triu(all_ones, 1+mlen)
|
dec_attn_mask = (torch.triu(all_ones, 1+mlen)
|
||||||
+ torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None] # -1
|
+ torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1
|
||||||
else:
|
else:
|
||||||
dec_attn_mask = torch.triu(
|
dec_attn_mask = torch.triu(
|
||||||
word_emb.new_ones(qlen, klen), diagonal=1+mlen).bool()[:,:,None]
|
word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]
|
||||||
|
|
||||||
hids = []
|
hids = []
|
||||||
attentions = []
|
attentions = []
|
||||||
if self.attn_type == 0: # default
|
if self.attn_type == 0: # default
|
||||||
pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device,
|
pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device,
|
||||||
dtype=word_emb.dtype)
|
dtype=word_emb.dtype)
|
||||||
if self.clamp_len > 0:
|
if self.clamp_len > 0:
|
||||||
pos_seq.clamp_(max=self.clamp_len)
|
pos_seq.clamp_(max=self.clamp_len)
|
||||||
@@ -1304,9 +1173,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
|||||||
self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
|
self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
|
||||||
# use adaptive softmax (including standard softmax)
|
# use adaptive softmax (including standard softmax)
|
||||||
else:
|
else:
|
||||||
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
|
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
|
||||||
config.cutoffs, div_val=config.div_val)
|
config.cutoffs, div_val=config.div_val)
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -1342,7 +1211,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
|||||||
def init_mems(self, data):
|
def init_mems(self, data):
|
||||||
return self.transformer.init_mems(data)
|
return self.transformer.init_mems(data)
|
||||||
|
|
||||||
def forward(self, input_ids, labels=None, mems=None, head_mask=None):
|
def forward(self, input_ids, mems=None, head_mask=None, labels=None):
|
||||||
bsz = input_ids.size(0)
|
bsz = input_ids.size(0)
|
||||||
tgt_len = input_ids.size(1)
|
tgt_len = input_ids.size(1)
|
||||||
|
|
||||||
|
|||||||
@@ -30,14 +30,11 @@ from torch import nn
|
|||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
from .file_utils import cached_path
|
from .configuration_utils import PretrainedConfig
|
||||||
|
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
CONFIG_NAME = "config.json"
|
|
||||||
WEIGHTS_NAME = "pytorch_model.bin"
|
|
||||||
TF_WEIGHTS_NAME = 'model.ckpt'
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from torch.nn import Identity
|
from torch.nn import Identity
|
||||||
@@ -52,194 +49,6 @@ except ImportError:
|
|||||||
def forward(self, input):
|
def forward(self, input):
|
||||||
return input
|
return input
|
||||||
|
|
||||||
|
|
||||||
if not six.PY2:
|
|
||||||
def add_start_docstrings(*docstr):
|
|
||||||
def docstring_decorator(fn):
|
|
||||||
fn.__doc__ = ''.join(docstr) + fn.__doc__
|
|
||||||
return fn
|
|
||||||
return docstring_decorator
|
|
||||||
else:
|
|
||||||
# Not possible to update class docstrings on python2
|
|
||||||
def add_start_docstrings(*docstr):
|
|
||||||
def docstring_decorator(fn):
|
|
||||||
return fn
|
|
||||||
return docstring_decorator
|
|
||||||
|
|
||||||
|
|
||||||
class PretrainedConfig(object):
|
|
||||||
r""" Base class for all configuration classes.
|
|
||||||
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
|
|
||||||
|
|
||||||
Note:
|
|
||||||
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
|
|
||||||
It only affects the model's configuration.
|
|
||||||
|
|
||||||
Class attributes (overridden by derived classes):
|
|
||||||
- ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
|
|
||||||
``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
|
|
||||||
``output_attentions``: boolean, default `False`. Should the model returns attentions weights.
|
|
||||||
``output_hidden_states``: string, default `False`. Should the model returns all hidden-states.
|
|
||||||
``torchscript``: string, default `False`. Is the model used with Torchscript.
|
|
||||||
"""
|
|
||||||
pretrained_config_archive_map = {}
|
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
|
||||||
self.finetuning_task = kwargs.pop('finetuning_task', None)
|
|
||||||
self.num_labels = kwargs.pop('num_labels', 2)
|
|
||||||
self.output_attentions = kwargs.pop('output_attentions', False)
|
|
||||||
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
|
|
||||||
self.torchscript = kwargs.pop('torchscript', False)
|
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
|
||||||
""" Save a configuration object to the directory `save_directory`, so that it
|
|
||||||
can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
|
|
||||||
"""
|
|
||||||
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
|
|
||||||
|
|
||||||
# If we save using the predefined names, we can load using `from_pretrained`
|
|
||||||
output_config_file = os.path.join(save_directory, CONFIG_NAME)
|
|
||||||
|
|
||||||
self.to_json_file(output_config_file)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
|
||||||
r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
pretrained_model_name_or_path: either:
|
|
||||||
|
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
|
||||||
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
|
||||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
|
||||||
|
|
||||||
cache_dir: (`optional`) string:
|
|
||||||
Path to a directory in which a downloaded pre-trained model
|
|
||||||
configuration should be cached if the standard cache should not be used.
|
|
||||||
|
|
||||||
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
|
|
||||||
|
|
||||||
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
|
|
||||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
|
|
||||||
|
|
||||||
force_download: (`optional`) boolean, default False:
|
|
||||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
|
||||||
|
|
||||||
proxies: (`optional`) dict, default None:
|
|
||||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
|
||||||
The proxies are used on each request.
|
|
||||||
|
|
||||||
return_unused_kwargs: (`optional`) bool:
|
|
||||||
|
|
||||||
- If False, then this function returns just the final configuration object.
|
|
||||||
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
|
||||||
|
|
||||||
Examples::
|
|
||||||
|
|
||||||
# We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
|
|
||||||
# derived class: BertConfig
|
|
||||||
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
|
||||||
config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
|
||||||
config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
|
|
||||||
config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
|
|
||||||
assert config.output_attention == True
|
|
||||||
config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
|
|
||||||
foo=False, return_unused_kwargs=True)
|
|
||||||
assert config.output_attention == True
|
|
||||||
assert unused_kwargs == {'foo': False}
|
|
||||||
|
|
||||||
"""
|
|
||||||
cache_dir = kwargs.pop('cache_dir', None)
|
|
||||||
force_download = kwargs.pop('force_download', False)
|
|
||||||
proxies = kwargs.pop('proxies', None)
|
|
||||||
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
|
|
||||||
|
|
||||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
|
||||||
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
|
|
||||||
elif os.path.isdir(pretrained_model_name_or_path):
|
|
||||||
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
|
|
||||||
else:
|
|
||||||
config_file = pretrained_model_name_or_path
|
|
||||||
# redirect to the cache, if necessary
|
|
||||||
try:
|
|
||||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
|
||||||
except EnvironmentError as e:
|
|
||||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
|
||||||
logger.error(
|
|
||||||
"Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
|
||||||
config_file))
|
|
||||||
else:
|
|
||||||
logger.error(
|
|
||||||
"Model name '{}' was not found in model name list ({}). "
|
|
||||||
"We assumed '{}' was a path or url but couldn't find any file "
|
|
||||||
"associated to this path or url.".format(
|
|
||||||
pretrained_model_name_or_path,
|
|
||||||
', '.join(cls.pretrained_config_archive_map.keys()),
|
|
||||||
config_file))
|
|
||||||
raise e
|
|
||||||
if resolved_config_file == config_file:
|
|
||||||
logger.info("loading configuration file {}".format(config_file))
|
|
||||||
else:
|
|
||||||
logger.info("loading configuration file {} from cache at {}".format(
|
|
||||||
config_file, resolved_config_file))
|
|
||||||
|
|
||||||
# Load config
|
|
||||||
config = cls.from_json_file(resolved_config_file)
|
|
||||||
|
|
||||||
# Update config with kwargs if needed
|
|
||||||
to_remove = []
|
|
||||||
for key, value in kwargs.items():
|
|
||||||
if hasattr(config, key):
|
|
||||||
setattr(config, key, value)
|
|
||||||
to_remove.append(key)
|
|
||||||
for key in to_remove:
|
|
||||||
kwargs.pop(key, None)
|
|
||||||
|
|
||||||
logger.info("Model config %s", config)
|
|
||||||
if return_unused_kwargs:
|
|
||||||
return config, kwargs
|
|
||||||
else:
|
|
||||||
return config
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_dict(cls, json_object):
|
|
||||||
"""Constructs a `Config` from a Python dictionary of parameters."""
|
|
||||||
config = cls(vocab_size_or_config_json_file=-1)
|
|
||||||
for key, value in json_object.items():
|
|
||||||
config.__dict__[key] = value
|
|
||||||
return config
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_json_file(cls, json_file):
|
|
||||||
"""Constructs a `BertConfig` from a json file of parameters."""
|
|
||||||
with open(json_file, "r", encoding='utf-8') as reader:
|
|
||||||
text = reader.read()
|
|
||||||
return cls.from_dict(json.loads(text))
|
|
||||||
|
|
||||||
def __eq__(self, other):
|
|
||||||
return self.__dict__ == other.__dict__
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return str(self.to_json_string())
|
|
||||||
|
|
||||||
def to_dict(self):
|
|
||||||
"""Serializes this instance to a Python dictionary."""
|
|
||||||
output = copy.deepcopy(self.__dict__)
|
|
||||||
return output
|
|
||||||
|
|
||||||
def to_json_string(self):
|
|
||||||
"""Serializes this instance to a JSON string."""
|
|
||||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
|
||||||
|
|
||||||
def to_json_file(self, json_file_path):
|
|
||||||
""" Save this instance to a json file."""
|
|
||||||
with open(json_file_path, "w", encoding='utf-8') as writer:
|
|
||||||
writer.write(self.to_json_string())
|
|
||||||
|
|
||||||
|
|
||||||
class PreTrainedModel(nn.Module):
|
class PreTrainedModel(nn.Module):
|
||||||
r""" Base class for all models.
|
r""" Base class for all models.
|
||||||
|
|
||||||
@@ -300,7 +109,7 @@ class PreTrainedModel(nn.Module):
|
|||||||
new_embeddings.to(old_embeddings.weight.device)
|
new_embeddings.to(old_embeddings.weight.device)
|
||||||
|
|
||||||
# initialize all new embeddings (in particular added tokens)
|
# initialize all new embeddings (in particular added tokens)
|
||||||
self.init_weights(new_embeddings)
|
self._init_weights(new_embeddings)
|
||||||
|
|
||||||
# Copy word embeddings from the previous weights
|
# Copy word embeddings from the previous weights
|
||||||
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
|
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
|
||||||
@@ -316,6 +125,14 @@ class PreTrainedModel(nn.Module):
|
|||||||
else:
|
else:
|
||||||
first_module.weight = second_module.weight
|
first_module.weight = second_module.weight
|
||||||
|
|
||||||
|
if hasattr(first_module, 'bias') and first_module.bias is not None:
|
||||||
|
first_module.bias.data = torch.nn.functional.pad(
|
||||||
|
first_module.bias.data,
|
||||||
|
(0, first_module.weight.shape[0] - first_module.bias.shape[0]),
|
||||||
|
'constant',
|
||||||
|
0
|
||||||
|
)
|
||||||
|
|
||||||
def resize_token_embeddings(self, new_num_tokens=None):
|
def resize_token_embeddings(self, new_num_tokens=None):
|
||||||
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
|
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
|
||||||
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
|
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
|
||||||
@@ -323,7 +140,7 @@ class PreTrainedModel(nn.Module):
|
|||||||
Arguments:
|
Arguments:
|
||||||
|
|
||||||
new_num_tokens: (`optional`) int:
|
new_num_tokens: (`optional`) int:
|
||||||
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
|
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
|
||||||
If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
|
If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
|
||||||
|
|
||||||
Return: ``torch.nn.Embeddings``
|
Return: ``torch.nn.Embeddings``
|
||||||
@@ -344,14 +161,30 @@ class PreTrainedModel(nn.Module):
|
|||||||
|
|
||||||
return model_embeds
|
return model_embeds
|
||||||
|
|
||||||
|
def init_weights(self):
|
||||||
|
""" Initialize and prunes weights if needed. """
|
||||||
|
# Initialize weights
|
||||||
|
self.apply(self._init_weights)
|
||||||
|
|
||||||
|
# Prune heads if needed
|
||||||
|
if self.config.pruned_heads:
|
||||||
|
self.prune_heads(self.config.pruned_heads)
|
||||||
|
|
||||||
def prune_heads(self, heads_to_prune):
|
def prune_heads(self, heads_to_prune):
|
||||||
""" Prunes heads of the base model.
|
""" Prunes heads of the base model.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
|
||||||
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
|
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
|
||||||
|
E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
|
||||||
"""
|
"""
|
||||||
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
||||||
|
|
||||||
|
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
|
||||||
|
for layer, heads in heads_to_prune.items():
|
||||||
|
union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
|
||||||
|
self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
|
||||||
|
|
||||||
base_model._prune_heads(heads_to_prune)
|
base_model._prune_heads(heads_to_prune)
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
def save_pretrained(self, save_directory):
|
||||||
@@ -601,7 +434,10 @@ class PoolerStartLogits(nn.Module):
|
|||||||
x = self.dense(hidden_states).squeeze(-1)
|
x = self.dense(hidden_states).squeeze(-1)
|
||||||
|
|
||||||
if p_mask is not None:
|
if p_mask is not None:
|
||||||
x = x * (1 - p_mask) - 1e30 * p_mask
|
if next(self.parameters()).dtype == torch.float16:
|
||||||
|
x = x * (1 - p_mask) - 65500 * p_mask
|
||||||
|
else:
|
||||||
|
x = x * (1 - p_mask) - 1e30 * p_mask
|
||||||
|
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|||||||
@@ -16,11 +16,8 @@
|
|||||||
"""
|
"""
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import sys
|
|
||||||
from io import open
|
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -30,8 +27,9 @@ from torch import nn
|
|||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
from torch.nn import CrossEntropyLoss, MSELoss
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
|
||||||
from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings,
|
from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, SQuADHead
|
||||||
prune_linear_layer, SequenceSummary, SQuADHead)
|
from .configuration_xlm import XLMConfig
|
||||||
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -44,161 +42,9 @@ XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
|
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
|
||||||
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
|
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
|
||||||
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
|
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
|
||||||
|
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin",
|
||||||
|
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin",
|
||||||
}
|
}
|
||||||
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
||||||
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
|
|
||||||
'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
|
|
||||||
'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json",
|
|
||||||
'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
|
|
||||||
'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
|
|
||||||
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
|
|
||||||
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
|
|
||||||
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class XLMConfig(PretrainedConfig):
|
|
||||||
"""Configuration class to store the configuration of a `XLMModel`.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
|
|
||||||
d_model: Size of the encoder layers and the pooler layer.
|
|
||||||
n_layer: Number of hidden layers in the Transformer encoder.
|
|
||||||
n_head: Number of attention heads for each attention layer in
|
|
||||||
the Transformer encoder.
|
|
||||||
d_inner: The size of the "intermediate" (i.e., feed-forward)
|
|
||||||
layer in the Transformer encoder.
|
|
||||||
ff_activation: The non-linear activation function (function or string) in the
|
|
||||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
|
||||||
untie_r: untie relative position biases
|
|
||||||
attn_type: 'bi' for XLM, 'uni' for Transformer-XL
|
|
||||||
|
|
||||||
dropout: The dropout probabilitiy for all fully connected
|
|
||||||
layers in the embeddings, encoder, and pooler.
|
|
||||||
dropatt: The dropout ratio for the attention
|
|
||||||
probabilities.
|
|
||||||
max_position_embeddings: The maximum sequence length that this model might
|
|
||||||
ever be used with. Typically set this to something large just in case
|
|
||||||
(e.g., 512 or 1024 or 2048).
|
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
|
||||||
initializing all weight matrices.
|
|
||||||
layer_norm_eps: The epsilon used by LayerNorm.
|
|
||||||
|
|
||||||
dropout: float, dropout rate.
|
|
||||||
dropatt: float, dropout rate on attention probabilities.
|
|
||||||
init: str, the initialization scheme, either "normal" or "uniform".
|
|
||||||
init_range: float, initialize the parameters with a uniform distribution
|
|
||||||
in [-init_range, init_range]. Only effective when init="uniform".
|
|
||||||
init_std: float, initialize the parameters with a normal distribution
|
|
||||||
with mean 0 and stddev init_std. Only effective when init="normal".
|
|
||||||
mem_len: int, the number of tokens to cache.
|
|
||||||
reuse_len: int, the number of tokens in the currect batch to be cached
|
|
||||||
and reused in the future.
|
|
||||||
bi_data: bool, whether to use bidirectional input pipeline.
|
|
||||||
Usually set to True during pretraining and False during finetuning.
|
|
||||||
clamp_len: int, clamp all relative distances larger than clamp_len.
|
|
||||||
-1 means no clamping.
|
|
||||||
same_length: bool, whether to use the same attention length for each token.
|
|
||||||
"""
|
|
||||||
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
vocab_size_or_config_json_file=30145,
|
|
||||||
emb_dim=2048,
|
|
||||||
n_layers=12,
|
|
||||||
n_heads=16,
|
|
||||||
dropout=0.1,
|
|
||||||
attention_dropout=0.1,
|
|
||||||
gelu_activation=True,
|
|
||||||
sinusoidal_embeddings=False,
|
|
||||||
causal=False,
|
|
||||||
asm=False,
|
|
||||||
n_langs=1,
|
|
||||||
max_position_embeddings=512,
|
|
||||||
embed_init_std=2048 ** -0.5,
|
|
||||||
layer_norm_eps=1e-12,
|
|
||||||
init_std=0.02,
|
|
||||||
bos_index=0,
|
|
||||||
eos_index=1,
|
|
||||||
pad_index=2,
|
|
||||||
unk_index=3,
|
|
||||||
mask_index=5,
|
|
||||||
is_encoder=True,
|
|
||||||
|
|
||||||
finetuning_task=None,
|
|
||||||
num_labels=2,
|
|
||||||
summary_type='first',
|
|
||||||
summary_use_proj=True,
|
|
||||||
summary_activation=None,
|
|
||||||
summary_proj_to_labels=True,
|
|
||||||
summary_first_dropout=0.1,
|
|
||||||
start_n_top=5,
|
|
||||||
end_n_top=5,
|
|
||||||
**kwargs):
|
|
||||||
"""Constructs XLMConfig.
|
|
||||||
"""
|
|
||||||
super(XLMConfig, self).__init__(**kwargs)
|
|
||||||
|
|
||||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
|
||||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
|
||||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
|
||||||
json_config = json.loads(reader.read())
|
|
||||||
for key, value in json_config.items():
|
|
||||||
self.__dict__[key] = value
|
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
|
||||||
self.n_words = vocab_size_or_config_json_file
|
|
||||||
self.emb_dim = emb_dim
|
|
||||||
self.n_layers = n_layers
|
|
||||||
self.n_heads = n_heads
|
|
||||||
self.dropout = dropout
|
|
||||||
self.attention_dropout = attention_dropout
|
|
||||||
self.gelu_activation = gelu_activation
|
|
||||||
self.sinusoidal_embeddings = sinusoidal_embeddings
|
|
||||||
self.causal = causal
|
|
||||||
self.asm = asm
|
|
||||||
self.n_langs = n_langs
|
|
||||||
self.layer_norm_eps = layer_norm_eps
|
|
||||||
self.bos_index = bos_index
|
|
||||||
self.eos_index = eos_index
|
|
||||||
self.pad_index = pad_index
|
|
||||||
self.unk_index = unk_index
|
|
||||||
self.mask_index = mask_index
|
|
||||||
self.is_encoder = is_encoder
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.embed_init_std = embed_init_std
|
|
||||||
self.init_std = init_std
|
|
||||||
self.finetuning_task = finetuning_task
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.summary_use_proj = summary_use_proj
|
|
||||||
self.summary_activation = summary_activation
|
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
|
||||||
self.summary_first_dropout = summary_first_dropout
|
|
||||||
self.start_n_top = start_n_top
|
|
||||||
self.end_n_top = end_n_top
|
|
||||||
else:
|
|
||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
|
||||||
" or the path to a pretrained model config file (str)")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def vocab_size(self):
|
|
||||||
return self.n_words
|
|
||||||
|
|
||||||
@vocab_size.setter
|
|
||||||
def vocab_size(self, value):
|
|
||||||
self.n_words = value
|
|
||||||
|
|
||||||
@property
|
|
||||||
def hidden_size(self):
|
|
||||||
return self.emb_dim
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_attention_heads(self):
|
|
||||||
return self.n_heads
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_hidden_layers(self):
|
|
||||||
return self.n_layers
|
|
||||||
|
|
||||||
|
|
||||||
def create_sinusoidal_embeddings(n_pos, dim, out):
|
def create_sinusoidal_embeddings(n_pos, dim, out):
|
||||||
@@ -265,13 +111,16 @@ class MultiHeadAttention(nn.Module):
|
|||||||
self.k_lin = nn.Linear(dim, dim)
|
self.k_lin = nn.Linear(dim, dim)
|
||||||
self.v_lin = nn.Linear(dim, dim)
|
self.v_lin = nn.Linear(dim, dim)
|
||||||
self.out_lin = nn.Linear(dim, dim)
|
self.out_lin = nn.Linear(dim, dim)
|
||||||
|
self.pruned_heads = set()
|
||||||
|
|
||||||
def prune_heads(self, heads):
|
def prune_heads(self, heads):
|
||||||
attention_head_size = self.dim // self.n_heads
|
attention_head_size = self.dim // self.n_heads
|
||||||
if len(heads) == 0:
|
if len(heads) == 0:
|
||||||
return
|
return
|
||||||
mask = torch.ones(self.n_heads, attention_head_size)
|
mask = torch.ones(self.n_heads, attention_head_size)
|
||||||
|
heads = set(heads) - self.pruned_heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
|
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||||
mask[head] = 0
|
mask[head] = 0
|
||||||
mask = mask.view(-1).contiguous().eq(1)
|
mask = mask.view(-1).contiguous().eq(1)
|
||||||
index = torch.arange(len(mask))[mask].long()
|
index = torch.arange(len(mask))[mask].long()
|
||||||
@@ -283,6 +132,7 @@ class MultiHeadAttention(nn.Module):
|
|||||||
# Update hyper params
|
# Update hyper params
|
||||||
self.n_heads = self.n_heads - len(heads)
|
self.n_heads = self.n_heads - len(heads)
|
||||||
self.dim = attention_head_size * self.n_heads
|
self.dim = attention_head_size * self.n_heads
|
||||||
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def forward(self, input, mask, kv=None, cache=None, head_mask=None):
|
def forward(self, input, mask, kv=None, cache=None, head_mask=None):
|
||||||
"""
|
"""
|
||||||
@@ -377,7 +227,7 @@ class XLMPreTrainedModel(PreTrainedModel):
|
|||||||
def __init__(self, *inputs, **kwargs):
|
def __init__(self, *inputs, **kwargs):
|
||||||
super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
|
super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||||
|
|
||||||
def init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
""" Initialize the weights. """
|
""" Initialize the weights. """
|
||||||
if isinstance(module, nn.Embedding):
|
if isinstance(module, nn.Embedding):
|
||||||
if self.config is not None and self.config.embed_init_std is not None:
|
if self.config is not None and self.config.embed_init_std is not None:
|
||||||
@@ -431,23 +281,23 @@ XLM_INPUTS_DOCSTRING = r"""
|
|||||||
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
Mask values selected in ``[0, 1]``:
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
A parallel sequence of tokens to be used to indicate the language of each token in the input.
|
A parallel sequence of tokens to be used to indicate the language of each token in the input.
|
||||||
Indices are languages ids which can be obtained from the language names by using two conversion mappings
|
Indices are languages ids which can be obtained from the language names by using two conversion mappings
|
||||||
provided in the configuration of the model (only provided for multilingual models).
|
provided in the configuration of the model (only provided for multilingual models).
|
||||||
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
|
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
|
||||||
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
|
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
Mask values selected in ``[0, 1]``:
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
**lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
**lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||||
Length of each sentence that can be used to avoid performing attention on padding token indices.
|
Length of each sentence that can be used to avoid performing attention on padding token indices.
|
||||||
You can also use `attention_mask` for the same result (see above), kept here for compatbility.
|
You can also use `attention_mask` for the same result (see above), kept here for compatbility.
|
||||||
@@ -488,7 +338,7 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
|
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
|
||||||
'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads',
|
'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads',
|
||||||
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
|
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
|
||||||
'asm_cutoffs', 'asm_div_value']
|
'asm_cutoffs', 'asm_div_value']
|
||||||
|
|
||||||
@@ -507,6 +357,7 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
|
|
||||||
# dictionary / languages
|
# dictionary / languages
|
||||||
self.n_langs = config.n_langs
|
self.n_langs = config.n_langs
|
||||||
|
self.use_lang_emb = config.use_lang_emb
|
||||||
self.n_words = config.n_words
|
self.n_words = config.n_words
|
||||||
self.eos_index = config.eos_index
|
self.eos_index = config.eos_index
|
||||||
self.pad_index = config.pad_index
|
self.pad_index = config.pad_index
|
||||||
@@ -529,7 +380,7 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
|
self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
|
||||||
if config.sinusoidal_embeddings:
|
if config.sinusoidal_embeddings:
|
||||||
create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
|
create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
|
||||||
if config.n_langs > 1:
|
if config.n_langs > 1 and config.use_lang_emb:
|
||||||
self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
|
self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
|
||||||
self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
|
self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
|
||||||
self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
|
self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
|
||||||
@@ -552,7 +403,14 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
|
self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
|
||||||
self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
|
self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
if hasattr(config, "pruned_heads"):
|
||||||
|
pruned_heads = config.pruned_heads.copy().items()
|
||||||
|
config.pruned_heads = {}
|
||||||
|
for layer, heads in pruned_heads:
|
||||||
|
if self.attentions[int(layer)].n_heads == config.n_heads:
|
||||||
|
self.prune_heads({int(layer): list(map(int, heads))})
|
||||||
|
|
||||||
|
self.init_weights()
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
|
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
|
||||||
@@ -566,8 +424,8 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.attentions[layer].prune_heads(heads)
|
self.attentions[layer].prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None,
|
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
|
||||||
token_type_ids=None, attention_mask=None, cache=None, head_mask=None): # src_enc=None, src_len=None,
|
lengths=None, cache=None, head_mask=None): # removed: src_enc=None, src_len=None
|
||||||
if lengths is None:
|
if lengths is None:
|
||||||
lengths = (input_ids != self.pad_index).sum(dim=1).long()
|
lengths = (input_ids != self.pad_index).sum(dim=1).long()
|
||||||
# mask = input_ids != self.pad_index
|
# mask = input_ids != self.pad_index
|
||||||
@@ -628,7 +486,7 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
# embeddings
|
# embeddings
|
||||||
tensor = self.embeddings(input_ids)
|
tensor = self.embeddings(input_ids)
|
||||||
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
|
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
|
||||||
if langs is not None:
|
if langs is not None and self.use_lang_emb:
|
||||||
tensor = tensor + self.lang_embeddings(langs)
|
tensor = tensor + self.lang_embeddings(langs)
|
||||||
if token_type_ids is not None:
|
if token_type_ids is not None:
|
||||||
tensor = tensor + self.embeddings(token_type_ids)
|
tensor = tensor + self.embeddings(token_type_ids)
|
||||||
@@ -764,7 +622,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
|||||||
self.transformer = XLMModel(config)
|
self.transformer = XLMModel(config)
|
||||||
self.pred_layer = XLMPredLayer(config)
|
self.pred_layer = XLMPredLayer(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -772,11 +630,16 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
|
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
|
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
|
||||||
attention_mask=None, cache=None, labels=None, head_mask=None):
|
lengths=None, cache=None, head_mask=None, labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
token_type_ids=token_type_ids, langs=langs,
|
attention_mask=attention_mask,
|
||||||
attention_mask=attention_mask, cache=cache, head_mask=head_mask)
|
langs=langs,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
lengths=lengths,
|
||||||
|
cache=cache,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
output = transformer_outputs[0]
|
output = transformer_outputs[0]
|
||||||
outputs = self.pred_layer(output, labels)
|
outputs = self.pred_layer(output, labels)
|
||||||
@@ -826,13 +689,18 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
|||||||
self.transformer = XLMModel(config)
|
self.transformer = XLMModel(config)
|
||||||
self.sequence_summary = SequenceSummary(config)
|
self.sequence_summary = SequenceSummary(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
|
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
|
||||||
attention_mask=None, cache=None, labels=None, head_mask=None):
|
lengths=None, cache=None, head_mask=None, labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
token_type_ids=token_type_ids, langs=langs,
|
attention_mask=attention_mask,
|
||||||
attention_mask=attention_mask, cache=cache, head_mask=head_mask)
|
langs=langs,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
lengths=lengths,
|
||||||
|
cache=cache,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
output = transformer_outputs[0]
|
output = transformer_outputs[0]
|
||||||
logits = self.sequence_summary(output)
|
logits = self.sequence_summary(output)
|
||||||
@@ -904,14 +772,19 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
|||||||
self.transformer = XLMModel(config)
|
self.transformer = XLMModel(config)
|
||||||
self.qa_outputs = SQuADHead(config)
|
self.qa_outputs = SQuADHead(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
|
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
|
||||||
attention_mask=None, cache=None, start_positions=None, end_positions=None,
|
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
|
||||||
cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
|
is_impossible=None, cls_index=None, p_mask=None):
|
||||||
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
token_type_ids=token_type_ids, langs=langs,
|
attention_mask=attention_mask,
|
||||||
attention_mask=attention_mask, cache=cache, head_mask=head_mask)
|
langs=langs,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
lengths=lengths,
|
||||||
|
cache=cache,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
output = transformer_outputs[0]
|
output = transformer_outputs[0]
|
||||||
|
|
||||||
|
|||||||
@@ -29,9 +29,9 @@ from torch import nn
|
|||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
from torch.nn import CrossEntropyLoss, MSELoss
|
from torch.nn import CrossEntropyLoss, MSELoss
|
||||||
|
|
||||||
from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
|
from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits
|
||||||
SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits,
|
from .configuration_xlnet import XLNetConfig
|
||||||
add_start_docstrings)
|
from .file_utils import add_start_docstrings
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -40,10 +40,6 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin",
|
'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin",
|
||||||
'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
|
'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
|
||||||
}
|
}
|
||||||
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
||||||
'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
|
|
||||||
'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
|
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
|
||||||
@@ -192,165 +188,11 @@ def swish(x):
|
|||||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
|
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
|
||||||
|
|
||||||
|
|
||||||
class XLNetConfig(PretrainedConfig):
    """Configuration class to store the configuration of a ``XLNetModel``.

    Args:
        vocab_size_or_config_json_file: Either the vocabulary size (``int``) of ``inputs_ids``
            in ``XLNetModel`` (stored as ``n_token``), or the path (``str``) to a JSON
            configuration file. When a path is given, every key of the JSON object is copied
            onto the instance and all the other arguments below are ignored.
        d_model: Size of the encoder layers and the pooler layer.
        n_layer: Number of hidden layers in the Transformer encoder.
        n_head: Number of attention heads for each attention layer in
            the Transformer encoder. Must divide ``d_model``.
        d_inner: The size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
        ff_activation: The non-linear activation function (function or string) in the
            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
        untie_r: untie relative position biases.
        attn_type: 'bi' for XLNet, 'uni' for Transformer-XL.
        initializer_range: The stddev of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps: The epsilon used by LayerNorm.
        dropout: float, the dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        mem_len: int, the number of tokens to cache.
        reuse_len: int, the number of tokens in the current batch to be cached
            and reused in the future.
        bi_data: bool, whether to use bidirectional input pipeline.
            Usually set to True during pretraining and False during finetuning.
        clamp_len: int, clamp all relative distances larger than clamp_len.
            -1 means no clamping.
        same_length: bool, whether to use the same attention length for each token.
        finetuning_task: name of the glue task on which the model was fine-tuned if any.
        num_labels: stored unchanged on the config (presumably the number of
            classification labels -- confirm against the classification heads).
        summary_type, summary_use_proj, summary_activation, summary_last_dropout:
            stored unchanged on the config (presumably consumed by ``SequenceSummary``
            -- confirm against that class).
        start_n_top, end_n_top: stored unchanged on the config (presumably beam sizes
            of the question-answering head -- confirm against that head).
        **kwargs: forwarded to ``PretrainedConfig.__init__``.

    Note:
        Earlier versions of this docstring also listed ``dropatt``, ``init``,
        ``init_range`` and ``init_std``. They are not parameters of this constructor;
        if passed they simply travel through ``**kwargs`` to the parent class.
    """
    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=32000,
                 d_model=1024,
                 n_layer=24,
                 n_head=16,
                 d_inner=4096,
                 ff_activation="gelu",
                 untie_r=True,
                 attn_type="bi",

                 initializer_range=0.02,
                 layer_norm_eps=1e-12,

                 dropout=0.1,
                 mem_len=None,
                 reuse_len=None,
                 bi_data=False,
                 clamp_len=-1,
                 same_length=False,

                 finetuning_task=None,
                 num_labels=2,
                 summary_type='last',
                 summary_use_proj=True,
                 summary_activation='tanh',
                 summary_last_dropout=0.1,
                 start_n_top=5,
                 end_n_top=5,
                 **kwargs):
        """Constructs XLNetConfig.

        Raises:
            AssertionError: if ``d_model`` is not a multiple of ``n_head``.
            ValueError: if the first argument is neither an ``int`` nor a ``str`` path.
        """
        super(XLNetConfig, self).__init__(**kwargs)

        # On Python 2 a path may also arrive as ``unicode``; the name is only
        # evaluated there thanks to the short-circuiting ``and``.
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):  # noqa: F821
            # NOTE(review): ``encoding=`` requires ``io.open`` on Python 2 -- confirm the
            # module does ``from io import open`` at the top of the file.
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.load(reader)
            # Written straight into ``__dict__`` (same as the original per-key loop),
            # so the properties defined below are not invoked.
            self.__dict__.update(json_config)
        elif isinstance(vocab_size_or_config_json_file, int):
            self.n_token = vocab_size_or_config_json_file
            self.d_model = d_model
            self.n_layer = n_layer
            self.n_head = n_head
            assert d_model % n_head == 0, \
                "d_model ({}) must be a multiple of n_head ({})".format(d_model, n_head)
            self.d_head = d_model // n_head
            self.ff_activation = ff_activation
            self.d_inner = d_inner
            self.untie_r = untie_r
            self.attn_type = attn_type

            self.initializer_range = initializer_range
            self.layer_norm_eps = layer_norm_eps

            self.dropout = dropout
            self.mem_len = mem_len
            self.reuse_len = reuse_len
            self.bi_data = bi_data
            self.clamp_len = clamp_len
            self.same_length = same_length

            self.finetuning_task = finetuning_task
            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_last_dropout = summary_last_dropout
            self.start_n_top = start_n_top
            self.end_n_top = end_n_top
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")

    # The properties below expose the XLNet-specific attribute names under the
    # generic names checked by the shared configuration tests.

    @property
    def max_position_embeddings(self):
        # Always -1 for this configuration.
        return -1

    @property
    def vocab_size(self):
        return self.n_token

    @vocab_size.setter
    def vocab_size(self, value):
        self.n_token = value

    @property
    def hidden_size(self):
        return self.d_model

    @property
    def num_attention_heads(self):
        return self.n_head

    @property
    def num_hidden_layers(self):
        return self.n_layer
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
|
from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
|
||||||
except (ImportError, AttributeError) as e:
|
except (ImportError, AttributeError) as e:
|
||||||
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
||||||
class XLNetLayerNorm(nn.Module):
|
from torch.nn import LayerNorm as XLNetLayerNorm
|
||||||
def __init__(self, d_model, eps=1e-12):
|
|
||||||
"""Construct a layernorm module in the TF style (epsilon inside the square root).
|
|
||||||
"""
|
|
||||||
super(XLNetLayerNorm, self).__init__()
|
|
||||||
self.weight = nn.Parameter(torch.ones(d_model))
|
|
||||||
self.bias = nn.Parameter(torch.zeros(d_model))
|
|
||||||
self.variance_epsilon = eps
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
u = x.mean(-1, keepdim=True)
|
|
||||||
s = (x - u).pow(2).mean(-1, keepdim=True)
|
|
||||||
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
|
|
||||||
return self.weight * x + self.bias
|
|
||||||
|
|
||||||
class XLNetRelativeAttention(nn.Module):
|
class XLNetRelativeAttention(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -418,7 +260,10 @@ class XLNetRelativeAttention(nn.Module):
|
|||||||
attn_score = (ac + bd + ef) * self.scale
|
attn_score = (ac + bd + ef) * self.scale
|
||||||
if attn_mask is not None:
|
if attn_mask is not None:
|
||||||
# attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
|
# attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
|
||||||
attn_score = attn_score - 1e30 * attn_mask
|
if attn_mask.dtype == torch.float16:
|
||||||
|
attn_score = attn_score - 65500 * attn_mask
|
||||||
|
else:
|
||||||
|
attn_score = attn_score - 1e30 * attn_mask
|
||||||
|
|
||||||
# attention probability
|
# attention probability
|
||||||
attn_prob = F.softmax(attn_score, dim=1)
|
attn_prob = F.softmax(attn_score, dim=1)
|
||||||
@@ -596,10 +441,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
|
|||||||
load_tf_weights = load_tf_weights_in_xlnet
|
load_tf_weights = load_tf_weights_in_xlnet
|
||||||
base_model_prefix = "transformer"
|
base_model_prefix = "transformer"
|
||||||
|
|
||||||
def __init__(self, *inputs, **kwargs):
|
def _init_weights(self, module):
|
||||||
super(XLNetPreTrainedModel, self).__init__(*inputs, **kwargs)
|
|
||||||
|
|
||||||
def init_weights(self, module):
|
|
||||||
""" Initialize the weights.
|
""" Initialize the weights.
|
||||||
"""
|
"""
|
||||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||||
@@ -662,19 +504,14 @@ XLNET_INPUTS_DOCSTRING = r"""
|
|||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
the important thing is that they should be different for tokens which belong to different segments.
|
||||||
|
The model will compute relative segment differences from the given type indices:
|
||||||
|
0 if the segment id of two tokens are the same, 1 if not.
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
Mask to avoid performing attention on padding token indices.
|
|
||||||
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
|
|
||||||
Kept for compatibility with the original code base.
|
|
||||||
You can only uses one of `input_mask` and `attention_mask`
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
|
|
||||||
**mems**: (`optional`)
|
**mems**: (`optional`)
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
|
||||||
@@ -692,6 +529,17 @@ XLNET_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to indicate the output tokens to use.
|
Mask to indicate the output tokens to use.
|
||||||
If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
|
If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
|
||||||
Only used during pretraining for partial prediction or for sequential decoding (generation).
|
Only used during pretraining for partial prediction or for sequential decoding (generation).
|
||||||
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
|
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
|
||||||
|
Kept for compatibility with the original code base.
|
||||||
|
You can only uses one of `input_mask` and `attention_mask`
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -746,7 +594,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
||||||
self.dropout = nn.Dropout(config.dropout)
|
self.dropout = nn.Dropout(config.dropout)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def _resize_token_embeddings(self, new_num_tokens):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
|
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
|
||||||
@@ -850,8 +698,8 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
pos_emb = pos_emb.to(next(self.parameters()))
|
pos_emb = pos_emb.to(next(self.parameters()))
|
||||||
return pos_emb
|
return pos_emb
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||||
mems=None, perm_mask=None, target_mapping=None, head_mask=None):
|
token_type_ids=None, input_mask=None, head_mask=None):
|
||||||
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
|
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
|
||||||
# but we want a unified interface in the library with the batch size on the first dimension
|
# but we want a unified interface in the library with the batch size on the first dimension
|
||||||
# so we move here the first dimension (batch) to the end
|
# so we move here the first dimension (batch) to the end
|
||||||
@@ -1047,7 +895,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
self.transformer = XLNetModel(config)
|
self.transformer = XLNetModel(config)
|
||||||
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
@@ -1055,12 +903,15 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
|
self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||||
mems=None, perm_mask=None, target_mapping=None,
|
token_type_ids=None, input_mask=None, head_mask=None, labels=None):
|
||||||
labels=None, head_mask=None):
|
transformer_outputs = self.transformer(input_ids,
|
||||||
transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
|
attention_mask=attention_mask,
|
||||||
input_mask=input_mask, attention_mask=attention_mask,
|
mems=mems,
|
||||||
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
|
perm_mask=perm_mask,
|
||||||
|
target_mapping=target_mapping,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
input_mask=input_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
|
|
||||||
logits = self.lm_loss(transformer_outputs[0])
|
logits = self.lm_loss(transformer_outputs[0])
|
||||||
@@ -1124,14 +975,17 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
|||||||
self.sequence_summary = SequenceSummary(config)
|
self.sequence_summary = SequenceSummary(config)
|
||||||
self.logits_proj = nn.Linear(config.d_model, config.num_labels)
|
self.logits_proj = nn.Linear(config.d_model, config.num_labels)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||||
mems=None, perm_mask=None, target_mapping=None,
|
token_type_ids=None, input_mask=None, head_mask=None, labels=None):
|
||||||
labels=None, head_mask=None):
|
transformer_outputs = self.transformer(input_ids,
|
||||||
transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
|
attention_mask=attention_mask,
|
||||||
input_mask=input_mask, attention_mask=attention_mask,
|
mems=mems,
|
||||||
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
|
perm_mask=perm_mask,
|
||||||
|
target_mapping=target_mapping,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
input_mask=input_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
output = transformer_outputs[0]
|
output = transformer_outputs[0]
|
||||||
|
|
||||||
@@ -1270,15 +1124,18 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
|||||||
self.end_logits = PoolerEndLogits(config)
|
self.end_logits = PoolerEndLogits(config)
|
||||||
self.answer_class = PoolerAnswerClass(config)
|
self.answer_class = PoolerAnswerClass(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||||
mems=None, perm_mask=None, target_mapping=None,
|
token_type_ids=None, input_mask=None, head_mask=None,
|
||||||
start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
|
start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
|
||||||
head_mask=None):
|
transformer_outputs = self.transformer(input_ids,
|
||||||
transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
|
attention_mask=attention_mask,
|
||||||
input_mask=input_mask, attention_mask=attention_mask,
|
mems=mems,
|
||||||
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
|
perm_mask=perm_mask,
|
||||||
|
target_mapping=target_mapping,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
input_mask=input_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
|
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
|
||||||
|
|||||||
63
pytorch_transformers/tests/configuration_common_test.py
Normal file
63
pytorch_transformers/tests/configuration_common_test.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2019 HuggingFace Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigTester(object):
    """Reusable checks shared by the per-model configuration tests.

    Args:
        parent: object providing ``assertTrue`` and ``assertEqual``
            (typically the ``unittest.TestCase`` running the checks).
        config_class: configuration class under test; it is instantiated as
            ``config_class(**kwargs)``.
        **kwargs: keyword arguments forwarded to ``config_class`` and later
            compared against the serialized configuration.
    """

    def __init__(self, parent, config_class=None, **kwargs):
        self.parent = parent
        self.config_class = config_class
        self.inputs_dict = kwargs

    def create_and_test_config_common_properties(self):
        """Check that the config exposes the attribute names common to all models."""
        config = self.config_class(**self.inputs_dict)
        for name in ('vocab_size', 'hidden_size', 'num_attention_heads', 'num_hidden_layers'):
            self.parent.assertTrue(hasattr(config, name))

    def create_and_test_config_to_json_string(self):
        """Check that every constructor argument survives ``to_json_string``."""
        config = self.config_class(**self.inputs_dict)
        obj = json.loads(config.to_json_string())
        for key, value in self.inputs_dict.items():
            self.parent.assertEqual(obj[key], value)

    def create_and_test_config_to_json_file(self):
        """Check that a config round-trips through ``to_json_file`` / ``from_json_file``.

        The scratch file is created in the current working directory under a
        random name and is removed even when saving or loading raises.
        """
        config_first = self.config_class(**self.inputs_dict)
        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
        try:
            config_first.to_json_file(json_file_path)
            config_second = self.config_class.from_json_file(json_file_path)
        finally:
            # Guarded so a failure inside to_json_file (file never created)
            # is not masked by a FileNotFoundError from the cleanup.
            if os.path.exists(json_file_path):
                os.remove(json_file_path)
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())

    def run_common_tests(self):
        """Run all of the checks above, in order."""
        self.create_and_test_config_common_properties()
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -21,10 +21,15 @@ import shutil
|
|||||||
import pytest
|
import pytest
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
|
from pytorch_transformers import (AutoConfig, BertConfig,
|
||||||
|
AutoModel, BertModel,
|
||||||
|
AutoModelWithLMHead, BertForMaskedLM,
|
||||||
|
AutoModelForSequenceClassification, BertForSequenceClassification,
|
||||||
|
AutoModelForQuestionAnswering, BertForQuestionAnswering)
|
||||||
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
|
|
||||||
class AutoModelTest(unittest.TestCase):
|
class AutoModelTest(unittest.TestCase):
|
||||||
@@ -42,6 +47,42 @@ class AutoModelTest(unittest.TestCase):
|
|||||||
for value in loading_info.values():
|
for value in loading_info.values():
|
||||||
self.assertEqual(len(value), 0)
|
self.assertEqual(len(value), 0)
|
||||||
|
|
||||||
|
def test_lmhead_model_from_pretrained(self):
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
config = AutoConfig.from_pretrained(model_name)
|
||||||
|
self.assertIsNotNone(config)
|
||||||
|
self.assertIsInstance(config, BertConfig)
|
||||||
|
|
||||||
|
model = AutoModelWithLMHead.from_pretrained(model_name)
|
||||||
|
model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
self.assertIsInstance(model, BertForMaskedLM)
|
||||||
|
|
||||||
|
def test_sequence_classification_model_from_pretrained(self):
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
config = AutoConfig.from_pretrained(model_name)
|
||||||
|
self.assertIsNotNone(config)
|
||||||
|
self.assertIsInstance(config, BertConfig)
|
||||||
|
|
||||||
|
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
||||||
|
model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
self.assertIsInstance(model, BertForSequenceClassification)
|
||||||
|
|
||||||
|
def test_question_answering_model_from_pretrained(self):
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
config = AutoConfig.from_pretrained(model_name)
|
||||||
|
self.assertIsNotNone(config)
|
||||||
|
self.assertIsInstance(config, BertConfig)
|
||||||
|
|
||||||
|
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
|
||||||
|
model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
self.assertIsInstance(model, BertForQuestionAnswering)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -26,7 +26,8 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
|
|||||||
BertForTokenClassification, BertForMultipleChoice)
|
BertForTokenClassification, BertForMultipleChoice)
|
||||||
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
|
|
||||||
class BertModelTest(CommonTestCases.CommonModelTester):
|
class BertModelTest(CommonTestCases.CommonModelTester):
|
||||||
@@ -126,8 +127,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertModel(config=config)
|
model = BertModel(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids)
|
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids)
|
sequence_output, pooled_output = model(input_ids)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
@@ -143,7 +144,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForMaskedLM(config=config)
|
model = BertForMaskedLM(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"prediction_scores": prediction_scores,
|
"prediction_scores": prediction_scores,
|
||||||
@@ -156,7 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForNextSentencePrediction(config=config)
|
model = BertForNextSentencePrediction(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
|
loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"seq_relationship_score": seq_relationship_score,
|
"seq_relationship_score": seq_relationship_score,
|
||||||
@@ -170,7 +171,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForPreTraining(config=config)
|
model = BertForPreTraining(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
|
loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
|
masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"prediction_scores": prediction_scores,
|
"prediction_scores": prediction_scores,
|
||||||
@@ -188,7 +190,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForQuestionAnswering(config=config)
|
model = BertForQuestionAnswering(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
|
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
|
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"start_logits": start_logits,
|
"start_logits": start_logits,
|
||||||
@@ -207,7 +210,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = BertForSequenceClassification(config)
|
model = BertForSequenceClassification(config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"logits": logits,
|
"logits": logits,
|
||||||
@@ -222,7 +225,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = BertForTokenClassification(config=config)
|
model = BertForTokenClassification(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"logits": logits,
|
"logits": logits,
|
||||||
@@ -241,9 +244,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
loss, logits = model(multiple_choice_inputs_ids,
|
loss, logits = model(multiple_choice_inputs_ids,
|
||||||
multiple_choice_token_type_ids,
|
attention_mask=multiple_choice_input_mask,
|
||||||
multiple_choice_input_mask,
|
token_type_ids=multiple_choice_token_type_ids,
|
||||||
choice_labels)
|
labels=choice_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"logits": logits,
|
"logits": logits,
|
||||||
|
|||||||
@@ -28,9 +28,9 @@ import logging
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_transformers import PretrainedConfig, PreTrainedModel
|
from pytorch_transformers import (PretrainedConfig, PreTrainedModel,
|
||||||
from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
from pytorch_transformers.modeling_gpt2 import GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
|
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
|
||||||
def _config_zero_init(config):
|
def _config_zero_init(config):
|
||||||
@@ -163,7 +163,9 @@ class CommonTestCases:
|
|||||||
if not self.test_head_masking:
|
if not self.test_head_masking:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
global_rng.seed(42)
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
global_rng.seed()
|
||||||
|
|
||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = True
|
config.output_hidden_states = True
|
||||||
@@ -173,7 +175,7 @@ class CommonTestCases:
|
|||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
# Prepare head_mask
|
# Prepare head_mask
|
||||||
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
||||||
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
|
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
|
||||||
head_mask[0, 0] = 0
|
head_mask[0, 0] = 0
|
||||||
head_mask[-1, :-1] = 0
|
head_mask[-1, :-1] = 0
|
||||||
@@ -212,9 +214,12 @@ class CommonTestCases:
|
|||||||
if not self.test_pruning:
|
if not self.test_pruning:
|
||||||
return
|
return
|
||||||
|
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
|
||||||
|
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
if "head_mask" in inputs_dict:
|
||||||
|
del inputs_dict["head_mask"]
|
||||||
|
|
||||||
config.output_attentions = True
|
config.output_attentions = True
|
||||||
config.output_hidden_states = False
|
config.output_hidden_states = False
|
||||||
model = model_class(config=config)
|
model = model_class(config=config)
|
||||||
@@ -233,6 +238,120 @@ class CommonTestCases:
|
|||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||||
|
|
||||||
|
def test_head_pruning_save_load_from_pretrained(self):
|
||||||
|
if not self.test_pruning:
|
||||||
|
return
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
if "head_mask" in inputs_dict:
|
||||||
|
del inputs_dict["head_mask"]
|
||||||
|
|
||||||
|
config.output_attentions = True
|
||||||
|
config.output_hidden_states = False
|
||||||
|
model = model_class(config=config)
|
||||||
|
model.eval()
|
||||||
|
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||||
|
-1: [0]}
|
||||||
|
model.prune_heads(heads_to_prune)
|
||||||
|
directory = "pruned_model"
|
||||||
|
if not os.path.exists(directory):
|
||||||
|
os.makedirs(directory)
|
||||||
|
model.save_pretrained(directory)
|
||||||
|
model = model_class.from_pretrained(directory)
|
||||||
|
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
|
attentions = outputs[-1]
|
||||||
|
self.assertEqual(attentions[0].shape[-3], 1)
|
||||||
|
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
||||||
|
self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||||
|
|
||||||
|
shutil.rmtree(directory)
|
||||||
|
|
||||||
|
def test_head_pruning_save_load_from_config_init(self):
|
||||||
|
if not self.test_pruning:
|
||||||
|
return
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
if "head_mask" in inputs_dict:
|
||||||
|
del inputs_dict["head_mask"]
|
||||||
|
|
||||||
|
config.output_attentions = True
|
||||||
|
config.output_hidden_states = False
|
||||||
|
|
||||||
|
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||||
|
-1: [0]}
|
||||||
|
config.pruned_heads = heads_to_prune
|
||||||
|
|
||||||
|
model = model_class(config=config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
|
attentions = outputs[-1]
|
||||||
|
|
||||||
|
self.assertEqual(attentions[0].shape[-3], 1)
|
||||||
|
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
||||||
|
self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||||
|
|
||||||
|
def test_head_pruning_integration(self):
|
||||||
|
if not self.test_pruning:
|
||||||
|
return
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
|
||||||
|
if "head_mask" in inputs_dict:
|
||||||
|
del inputs_dict["head_mask"]
|
||||||
|
|
||||||
|
config.output_attentions = True
|
||||||
|
config.output_hidden_states = False
|
||||||
|
|
||||||
|
heads_to_prune = {0: [0], 1: [1, 2]}
|
||||||
|
config.pruned_heads = heads_to_prune
|
||||||
|
|
||||||
|
model = model_class(config=config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
|
attentions = outputs[-1]
|
||||||
|
|
||||||
|
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||||
|
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
|
||||||
|
self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
|
||||||
|
self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
|
||||||
|
|
||||||
|
directory = "pruned_model"
|
||||||
|
|
||||||
|
if not os.path.exists(directory):
|
||||||
|
os.makedirs(directory)
|
||||||
|
model.save_pretrained(directory)
|
||||||
|
model = model_class.from_pretrained(directory)
|
||||||
|
shutil.rmtree(directory)
|
||||||
|
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
|
attentions = outputs[-1]
|
||||||
|
|
||||||
|
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||||
|
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
|
||||||
|
self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
|
||||||
|
self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
|
||||||
|
|
||||||
|
heads_to_prune = {0: [0], 2: [1, 2]}
|
||||||
|
model.prune_heads(heads_to_prune)
|
||||||
|
|
||||||
|
outputs = model(**inputs_dict)
|
||||||
|
attentions = outputs[-1]
|
||||||
|
|
||||||
|
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
|
||||||
|
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
|
||||||
|
self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
|
||||||
|
self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
|
||||||
|
|
||||||
|
self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
|
||||||
|
|
||||||
|
|
||||||
def test_hidden_states_output(self):
|
def test_hidden_states_output(self):
|
||||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
@@ -547,12 +666,13 @@ class ConfigTester(object):
|
|||||||
self.create_and_test_config_to_json_file()
|
self.create_and_test_config_to_json_file()
|
||||||
|
|
||||||
|
|
||||||
|
global_rng = random.Random()
|
||||||
|
|
||||||
|
|
||||||
def ids_tensor(shape, vocab_size, rng=None, name=None):
|
def ids_tensor(shape, vocab_size, rng=None, name=None):
|
||||||
"""Creates a random int32 tensor of the shape within the vocab size."""
|
"""Creates a random int32 tensor of the shape within the vocab size."""
|
||||||
if rng is None:
|
if rng is None:
|
||||||
rng = random.Random()
|
rng = global_rng
|
||||||
|
|
||||||
total_dims = 1
|
total_dims = 1
|
||||||
for dim in shape:
|
for dim in shape:
|
||||||
|
|||||||
@@ -17,14 +17,12 @@ from __future__ import division
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import shutil
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
|
from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
|
||||||
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
|
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
|
||||||
from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
|
|
||||||
class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||||
@@ -148,7 +146,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = DistilBertForQuestionAnswering(config=config)
|
model = DistilBertForQuestionAnswering(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
|
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"start_logits": start_logits,
|
"start_logits": start_logits,
|
||||||
@@ -166,7 +164,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = DistilBertForSequenceClassification(config)
|
model = DistilBertForSequenceClassification(config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, input_mask, sequence_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"logits": logits,
|
"logits": logits,
|
||||||
@@ -18,31 +18,197 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import pytest
|
import pytest
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
from pytorch_transformers import (GPT2Config, GPT2Model,
|
from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||||
|
|
||||||
from .modeling_common_test import CommonTestCases, ConfigTester
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
class GPT2ModelTest(unittest.TestCase):
|
|
||||||
|
class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
|
all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||||
|
|
||||||
|
class GPT2ModelTester(object):
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
parent,
|
||||||
|
batch_size=13,
|
||||||
|
seq_length=7,
|
||||||
|
is_training=True,
|
||||||
|
use_token_type_ids=True,
|
||||||
|
use_labels=True,
|
||||||
|
vocab_size=99,
|
||||||
|
hidden_size=32,
|
||||||
|
num_hidden_layers=5,
|
||||||
|
num_attention_heads=4,
|
||||||
|
intermediate_size=37,
|
||||||
|
hidden_act="gelu",
|
||||||
|
hidden_dropout_prob=0.1,
|
||||||
|
attention_probs_dropout_prob=0.1,
|
||||||
|
max_position_embeddings=512,
|
||||||
|
type_vocab_size=16,
|
||||||
|
type_sequence_label_size=2,
|
||||||
|
initializer_range=0.02,
|
||||||
|
num_labels=3,
|
||||||
|
num_choices=4,
|
||||||
|
scope=None,
|
||||||
|
):
|
||||||
|
self.parent = parent
|
||||||
|
self.batch_size = batch_size
|
||||||
|
self.seq_length = seq_length
|
||||||
|
self.is_training = is_training
|
||||||
|
self.use_token_type_ids = use_token_type_ids
|
||||||
|
self.use_labels = use_labels
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.hidden_act = hidden_act
|
||||||
|
self.hidden_dropout_prob = hidden_dropout_prob
|
||||||
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
self.type_vocab_size = type_vocab_size
|
||||||
|
self.type_sequence_label_size = type_sequence_label_size
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.num_labels = num_labels
|
||||||
|
self.num_choices = num_choices
|
||||||
|
self.scope = scope
|
||||||
|
|
||||||
|
def prepare_config_and_inputs(self):
|
||||||
|
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
|
token_type_ids = None
|
||||||
|
if self.use_token_type_ids:
|
||||||
|
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
|
sequence_labels = None
|
||||||
|
token_labels = None
|
||||||
|
choice_labels = None
|
||||||
|
if self.use_labels:
|
||||||
|
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||||
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||||
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
|
config = GPT2Config(
|
||||||
|
vocab_size_or_config_json_file=self.vocab_size,
|
||||||
|
n_embd=self.hidden_size,
|
||||||
|
n_layer=self.num_hidden_layers,
|
||||||
|
n_head=self.num_attention_heads,
|
||||||
|
# intermediate_size=self.intermediate_size,
|
||||||
|
# hidden_act=self.hidden_act,
|
||||||
|
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||||
|
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||||
|
n_positions=self.max_position_embeddings,
|
||||||
|
n_ctx=self.max_position_embeddings
|
||||||
|
# type_vocab_size=self.type_vocab_size,
|
||||||
|
# initializer_range=self.initializer_range
|
||||||
|
)
|
||||||
|
|
||||||
|
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||||
|
|
||||||
|
return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
|
||||||
|
|
||||||
|
def check_loss_output(self, result):
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["loss"].size()),
|
||||||
|
[])
|
||||||
|
|
||||||
|
def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = GPT2Model(config=config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||||
|
model(input_ids, token_type_ids=token_type_ids)
|
||||||
|
sequence_output, presents = model(input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"sequence_output": sequence_output,
|
||||||
|
"presents": presents,
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["sequence_output"].size()),
|
||||||
|
[self.batch_size, self.seq_length, self.hidden_size])
|
||||||
|
self.parent.assertEqual(len(result["presents"]), config.n_layer)
|
||||||
|
|
||||||
|
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = GPT2LMHeadModel(config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"loss": loss,
|
||||||
|
"lm_logits": lm_logits
|
||||||
|
}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["loss"].size()),
|
||||||
|
[])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["lm_logits"].size()),
|
||||||
|
[self.batch_size, self.seq_length, self.vocab_size])
|
||||||
|
|
||||||
|
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = GPT2DoubleHeadsModel(config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
loss, lm_logits, mc_logits, _ = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"loss": loss,
|
||||||
|
"lm_logits": lm_logits
|
||||||
|
}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["loss"].size()),
|
||||||
|
[])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["lm_logits"].size()),
|
||||||
|
[self.batch_size, self.seq_length, self.vocab_size])
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
(config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
|
||||||
|
inputs_dict = {
|
||||||
|
'input_ids': input_ids,
|
||||||
|
'token_type_ids': token_type_ids,
|
||||||
|
'head_mask': head_mask
|
||||||
|
}
|
||||||
|
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.model_tester = GPT2ModelTest.GPT2ModelTester(self)
|
||||||
|
self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)
|
||||||
|
|
||||||
def test_config(self):
|
def test_config(self):
|
||||||
config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)
|
self.config_tester.run_common_tests()
|
||||||
config_tester.run_common_tests()
|
|
||||||
|
|
||||||
def test_model(self):
|
def test_gpt2_model(self):
|
||||||
model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
lm_head_model_class=GPT2LMHeadModel,
|
self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
|
||||||
double_head_model_class=GPT2DoubleHeadsModel)
|
|
||||||
model_tester.run_common_tests(test_presents=True)
|
def test_gpt2_lm_head_model(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_gpt2_double_lm_head_model(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@pytest.mark.slow
|
||||||
def test_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
|
cache_dir = "/tmp/pytorch_transformers_test/"
|
||||||
lm_head_model_class=GPT2LMHeadModel,
|
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
double_head_model_class=GPT2DoubleHeadsModel)
|
model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
model_tester.run_slow_tests()
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -18,31 +18,195 @@ from __future__ import print_function
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import pytest
|
import pytest
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
|
from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||||
|
|
||||||
from .modeling_common_test import CommonTestCases, ConfigTester
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
class OpenAIModelTest(unittest.TestCase):
|
|
||||||
|
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
|
all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||||
|
|
||||||
|
class OpenAIGPTModelTester(object):
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
parent,
|
||||||
|
batch_size=13,
|
||||||
|
seq_length=7,
|
||||||
|
is_training=True,
|
||||||
|
use_token_type_ids=True,
|
||||||
|
use_labels=True,
|
||||||
|
vocab_size=99,
|
||||||
|
hidden_size=32,
|
||||||
|
num_hidden_layers=5,
|
||||||
|
num_attention_heads=4,
|
||||||
|
intermediate_size=37,
|
||||||
|
hidden_act="gelu",
|
||||||
|
hidden_dropout_prob=0.1,
|
||||||
|
attention_probs_dropout_prob=0.1,
|
||||||
|
max_position_embeddings=512,
|
||||||
|
type_vocab_size=16,
|
||||||
|
type_sequence_label_size=2,
|
||||||
|
initializer_range=0.02,
|
||||||
|
num_labels=3,
|
||||||
|
num_choices=4,
|
||||||
|
scope=None,
|
||||||
|
):
|
||||||
|
self.parent = parent
|
||||||
|
self.batch_size = batch_size
|
||||||
|
self.seq_length = seq_length
|
||||||
|
self.is_training = is_training
|
||||||
|
self.use_token_type_ids = use_token_type_ids
|
||||||
|
self.use_labels = use_labels
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.hidden_act = hidden_act
|
||||||
|
self.hidden_dropout_prob = hidden_dropout_prob
|
||||||
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
self.type_vocab_size = type_vocab_size
|
||||||
|
self.type_sequence_label_size = type_sequence_label_size
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.num_labels = num_labels
|
||||||
|
self.num_choices = num_choices
|
||||||
|
self.scope = scope
|
||||||
|
|
||||||
|
def prepare_config_and_inputs(self):
|
||||||
|
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
|
token_type_ids = None
|
||||||
|
if self.use_token_type_ids:
|
||||||
|
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
|
sequence_labels = None
|
||||||
|
token_labels = None
|
||||||
|
choice_labels = None
|
||||||
|
if self.use_labels:
|
||||||
|
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||||
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||||
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
|
config = OpenAIGPTConfig(
|
||||||
|
vocab_size_or_config_json_file=self.vocab_size,
|
||||||
|
n_embd=self.hidden_size,
|
||||||
|
n_layer=self.num_hidden_layers,
|
||||||
|
n_head=self.num_attention_heads,
|
||||||
|
# intermediate_size=self.intermediate_size,
|
||||||
|
# hidden_act=self.hidden_act,
|
||||||
|
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||||
|
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||||
|
n_positions=self.max_position_embeddings,
|
||||||
|
n_ctx=self.max_position_embeddings
|
||||||
|
# type_vocab_size=self.type_vocab_size,
|
||||||
|
# initializer_range=self.initializer_range
|
||||||
|
)
|
||||||
|
|
||||||
|
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||||
|
|
||||||
|
return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
|
||||||
|
|
||||||
|
def check_loss_output(self, result):
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["loss"].size()),
|
||||||
|
[])
|
||||||
|
|
||||||
|
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = OpenAIGPTModel(config=config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||||
|
model(input_ids, token_type_ids=token_type_ids)
|
||||||
|
(sequence_output,) = model(input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"sequence_output": sequence_output
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["sequence_output"].size()),
|
||||||
|
[self.batch_size, self.seq_length, self.hidden_size])
|
||||||
|
|
||||||
|
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = OpenAIGPTLMHeadModel(config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"loss": loss,
|
||||||
|
"lm_logits": lm_logits
|
||||||
|
}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["loss"].size()),
|
||||||
|
[])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["lm_logits"].size()),
|
||||||
|
[self.batch_size, self.seq_length, self.vocab_size])
|
||||||
|
|
||||||
|
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = OpenAIGPTDoubleHeadsModel(config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"loss": loss,
|
||||||
|
"lm_logits": lm_logits
|
||||||
|
}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["loss"].size()),
|
||||||
|
[])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["lm_logits"].size()),
|
||||||
|
[self.batch_size, self.seq_length, self.vocab_size])
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
(config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
|
||||||
|
inputs_dict = {
|
||||||
|
'input_ids': input_ids,
|
||||||
|
'token_type_ids': token_type_ids,
|
||||||
|
'head_mask': head_mask
|
||||||
|
}
|
||||||
|
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self)
|
||||||
|
self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
|
||||||
|
|
||||||
def test_config(self):
|
def test_config(self):
|
||||||
config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
|
self.config_tester.run_common_tests()
|
||||||
config_tester.run_common_tests()
|
|
||||||
|
|
||||||
def test_model(self):
|
def test_openai_gpt_model(self):
|
||||||
model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
lm_head_model_class=OpenAIGPTLMHeadModel,
|
self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs)
|
||||||
double_head_model_class=OpenAIGPTDoubleHeadsModel)
|
|
||||||
model_tester.run_common_tests(test_presents=False)
|
def test_openai_gpt_lm_head_model(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
|
||||||
|
|
||||||
|
def test_openai_gpt_double_lm_head_model(self):
|
||||||
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
|
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
||||||
|
|
||||||
@pytest.mark.slow
|
@pytest.mark.slow
|
||||||
def test_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
|
cache_dir = "/tmp/pytorch_transformers_test/"
|
||||||
lm_head_model_class=OpenAIGPTLMHeadModel,
|
for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
double_head_model_class=OpenAIGPTDoubleHeadsModel)
|
model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
model_tester.run_slow_tests()
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -24,7 +24,8 @@ import torch
|
|||||||
from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
|
from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
|
||||||
from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
|
|
||||||
class RobertaModelTest(CommonTestCases.CommonModelTester):
|
class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||||
@@ -123,8 +124,8 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
token_labels, choice_labels):
|
token_labels, choice_labels):
|
||||||
model = RobertaModel(config=config)
|
model = RobertaModel(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids)
|
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids)
|
sequence_output, pooled_output = model(input_ids)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
@@ -140,7 +141,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
|||||||
token_labels, choice_labels):
|
token_labels, choice_labels):
|
||||||
model = RobertaForMaskedLM(config=config)
|
model = RobertaForMaskedLM(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"prediction_scores": prediction_scores,
|
"prediction_scores": prediction_scores,
|
||||||
|
|||||||
@@ -16,9 +16,7 @@ from __future__ import absolute_import
|
|||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import os
|
|
||||||
import unittest
|
import unittest
|
||||||
import json
|
|
||||||
import random
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
import pytest
|
import pytest
|
||||||
@@ -28,7 +26,8 @@ import torch
|
|||||||
from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
||||||
from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,8 @@ import pytest
|
|||||||
from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
|
from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
|
||||||
from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
|
|
||||||
class XLMModelTest(CommonTestCases.CommonModelTester):
|
class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|||||||
@@ -28,7 +28,8 @@ import torch
|
|||||||
from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
|
from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
|
||||||
from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
|
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||||
|
from .configuration_common_test import ConfigTester
|
||||||
|
|
||||||
class XLNetModelTest(CommonTestCases.CommonModelTester):
|
class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||||
|
|
||||||
|
|||||||
@@ -41,8 +41,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
|
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
return self.tokenizer_class.from_pretrained(self.tmpdirname)
|
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self):
|
def get_input_output_texts(self):
|
||||||
input_text = u"UNwant\u00E9d,running"
|
input_text = u"UNwant\u00E9d,running"
|
||||||
|
|||||||
@@ -27,8 +27,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
|
|||||||
|
|
||||||
tokenizer_class = DistilBertTokenizer
|
tokenizer_class = DistilBertTokenizer
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
return DistilBertTokenizer.from_pretrained(self.tmpdirname)
|
return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def test_sequence_builders(self):
|
def test_sequence_builders(self):
|
||||||
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import json
|
import json
|
||||||
|
from io import open
|
||||||
|
|
||||||
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
|
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
|
||||||
|
|
||||||
@@ -31,36 +32,38 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||||
"lo", "low", "er",
|
"\u0120", "\u0120l", "\u0120n",
|
||||||
"low", "lowest", "newer", "wider", "<unk>"]
|
"\u0120lo", "\u0120low", "er",
|
||||||
|
"\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
|
||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
|
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
||||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
self.special_tokens_map = {"unk_token": "<unk>"}
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||||
with open(self.vocab_file, "w") as fp:
|
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens))
|
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||||
with open(self.merges_file, "w") as fp:
|
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
|
kwargs.update(self.special_tokens_map)
|
||||||
|
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self):
|
def get_input_output_texts(self):
|
||||||
input_text = u"lower newer"
|
input_text = u"lower newer"
|
||||||
output_text = u"lower<unk>newer"
|
output_text = u" lower newer"
|
||||||
return input_text, output_text
|
return input_text, output_text
|
||||||
|
|
||||||
def test_full_tokenizer(self):
|
def test_full_tokenizer(self):
|
||||||
tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||||
text = "lower"
|
text = "lower newer"
|
||||||
bpe_tokens = ["low", "er"]
|
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
|
||||||
tokens = tokenizer.tokenize(text)
|
tokens = tokenizer.tokenize(text)
|
||||||
self.assertListEqual(tokens, bpe_tokens)
|
self.assertListEqual(tokens, bpe_tokens)
|
||||||
|
|
||||||
input_tokens = tokens + [tokenizer.unk_token]
|
input_tokens = tokens + [tokenizer.unk_token]
|
||||||
input_bpe_tokens = [13, 12, 17]
|
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||||
|
|
||||||
|
|||||||
@@ -45,8 +45,8 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
with open(self.merges_file, "w") as fp:
|
with open(self.merges_file, "w") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)
|
return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self):
|
def get_input_output_texts(self):
|
||||||
input_text = u"lower newer"
|
input_text = u"lower newer"
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import unittest
|
import unittest
|
||||||
|
from io import open
|
||||||
|
|
||||||
from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
|
from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
|
||||||
from .tokenization_tests_commons import CommonTestCases
|
from .tokenization_tests_commons import CommonTestCases
|
||||||
@@ -30,36 +31,38 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||||
"lo", "low", "er",
|
"\u0120", "\u0120l", "\u0120n",
|
||||||
"low", "lowest", "newer", "wider", "<unk>"]
|
"\u0120lo", "\u0120low", "er",
|
||||||
|
"\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
|
||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
|
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
||||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
self.special_tokens_map = {"unk_token": "<unk>"}
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||||
with open(self.vocab_file, "w") as fp:
|
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens))
|
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||||
with open(self.merges_file, "w") as fp:
|
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
return RobertaTokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
|
kwargs.update(self.special_tokens_map)
|
||||||
|
return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self):
|
def get_input_output_texts(self):
|
||||||
input_text = u"lower newer"
|
input_text = u"lower newer"
|
||||||
output_text = u"lower<unk>newer"
|
output_text = u" lower newer"
|
||||||
return input_text, output_text
|
return input_text, output_text
|
||||||
|
|
||||||
def test_full_tokenizer(self):
|
def test_full_tokenizer(self):
|
||||||
tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||||
text = "lower"
|
text = "lower newer"
|
||||||
bpe_tokens = ["low", "er"]
|
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
|
||||||
tokens = tokenizer.tokenize(text)
|
tokens = tokenizer.tokenize(text)
|
||||||
self.assertListEqual(tokens, bpe_tokens)
|
self.assertListEqual(tokens, bpe_tokens)
|
||||||
|
|
||||||
input_tokens = tokens + [tokenizer.unk_token]
|
input_tokens = tokens + [tokenizer.unk_token]
|
||||||
input_bpe_tokens = [13, 12, 17]
|
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||||
|
|
||||||
|
|||||||
@@ -49,23 +49,48 @@ class CommonTestCases:
|
|||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(self.tmpdirname)
|
shutil.rmtree(self.tmpdirname)
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def get_input_output_texts(self):
|
def get_input_output_texts(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def test_save_and_load_tokenizer(self):
|
def test_tokenizers_common_properties(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token",
|
||||||
|
"pad_token", "cls_token", "mask_token"]
|
||||||
|
for attr in attributes_list:
|
||||||
|
self.assertTrue(hasattr(tokenizer, attr))
|
||||||
|
self.assertTrue(hasattr(tokenizer, attr + "_id"))
|
||||||
|
|
||||||
|
self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
|
||||||
|
self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids'))
|
||||||
|
|
||||||
|
attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder",
|
||||||
|
"added_tokens_decoder"]
|
||||||
|
for attr in attributes_list:
|
||||||
|
self.assertTrue(hasattr(tokenizer, attr))
|
||||||
|
|
||||||
|
def test_save_and_load_tokenizer(self):
|
||||||
|
# safety check on max_len default value so we are sure the test works
|
||||||
|
tokenizer = self.get_tokenizer()
|
||||||
|
self.assertNotEqual(tokenizer.max_len, 42)
|
||||||
|
|
||||||
|
# Now let's start the test
|
||||||
|
tokenizer = self.get_tokenizer(max_len=42)
|
||||||
|
|
||||||
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||||
|
|
||||||
with TemporaryDirectory() as tmpdirname:
|
with TemporaryDirectory() as tmpdirname:
|
||||||
tokenizer.save_pretrained(tmpdirname)
|
tokenizer.save_pretrained(tmpdirname)
|
||||||
tokenizer = tokenizer.from_pretrained(tmpdirname)
|
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
|
||||||
|
|
||||||
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||||
self.assertListEqual(before_tokens, after_tokens)
|
self.assertListEqual(before_tokens, after_tokens)
|
||||||
|
|
||||||
|
self.assertEqual(tokenizer.max_len, 42)
|
||||||
|
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
|
||||||
|
self.assertEqual(tokenizer.max_len, 43)
|
||||||
|
|
||||||
def test_pickle_tokenizer(self):
|
def test_pickle_tokenizer(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -95,7 +120,7 @@ class CommonTestCases:
|
|||||||
self.assertNotEqual(vocab_size, 0)
|
self.assertNotEqual(vocab_size, 0)
|
||||||
self.assertEqual(vocab_size, all_size)
|
self.assertEqual(vocab_size, all_size)
|
||||||
|
|
||||||
new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
|
new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
|
||||||
added_toks = tokenizer.add_tokens(new_toks)
|
added_toks = tokenizer.add_tokens(new_toks)
|
||||||
vocab_size_2 = tokenizer.vocab_size
|
vocab_size_2 = tokenizer.vocab_size
|
||||||
all_size_2 = len(tokenizer)
|
all_size_2 = len(tokenizer)
|
||||||
@@ -105,13 +130,15 @@ class CommonTestCases:
|
|||||||
self.assertEqual(added_toks, len(new_toks))
|
self.assertEqual(added_toks, len(new_toks))
|
||||||
self.assertEqual(all_size_2, all_size + len(new_toks))
|
self.assertEqual(all_size_2, all_size + len(new_toks))
|
||||||
|
|
||||||
tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
|
tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
|
||||||
|
out_string = tokenizer.decode(tokens)
|
||||||
|
|
||||||
self.assertGreaterEqual(len(tokens), 4)
|
self.assertGreaterEqual(len(tokens), 4)
|
||||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||||
|
|
||||||
new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
|
new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
|
||||||
'pad_token': "<<<<<|||>|>>>>|>"}
|
'pad_token': "<<<<<|||>|>>>>|>"}
|
||||||
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
|
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
|
||||||
vocab_size_3 = tokenizer.vocab_size
|
vocab_size_3 = tokenizer.vocab_size
|
||||||
all_size_3 = len(tokenizer)
|
all_size_3 = len(tokenizer)
|
||||||
@@ -122,14 +149,15 @@ class CommonTestCases:
|
|||||||
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
||||||
|
|
||||||
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
|
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
|
||||||
|
out_string = tokenizer.decode(tokens)
|
||||||
|
|
||||||
self.assertGreaterEqual(len(tokens), 6)
|
self.assertGreaterEqual(len(tokens), 6)
|
||||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||||
self.assertGreater(tokens[0], tokens[1])
|
self.assertGreater(tokens[0], tokens[1])
|
||||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||||
self.assertGreater(tokens[-2], tokens[-3])
|
self.assertGreater(tokens[-2], tokens[-3])
|
||||||
self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
|
self.assertEqual(tokens[0], tokenizer.eos_token_id)
|
||||||
self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
|
self.assertEqual(tokens[-2], tokenizer.pad_token_id)
|
||||||
|
|
||||||
|
|
||||||
def test_required_methods_tokenizer(self):
|
def test_required_methods_tokenizer(self):
|
||||||
|
|||||||
@@ -37,8 +37,9 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
|
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)
|
kwargs['lower_case'] = True
|
||||||
|
return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self):
|
def get_input_output_texts(self):
|
||||||
input_text = u"<unk> UNwanted , running"
|
input_text = u"<unk> UNwanted , running"
|
||||||
|
|||||||
@@ -44,8 +44,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
with open(self.merges_file, "w") as fp:
|
with open(self.merges_file, "w") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
return XLMTokenizer.from_pretrained(self.tmpdirname)
|
return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self):
|
def get_input_output_texts(self):
|
||||||
input_text = u"lower newer"
|
input_text = u"lower newer"
|
||||||
|
|||||||
@@ -35,8 +35,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
def get_tokenizer(self):
|
def get_tokenizer(self, **kwargs):
|
||||||
return XLNetTokenizer.from_pretrained(self.tmpdirname)
|
return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self):
|
def get_input_output_texts(self):
|
||||||
input_text = u"This is a test"
|
input_text = u"This is a test"
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ from .tokenization_transfo_xl import TransfoXLTokenizer
|
|||||||
from .tokenization_xlnet import XLNetTokenizer
|
from .tokenization_xlnet import XLNetTokenizer
|
||||||
from .tokenization_xlm import XLMTokenizer
|
from .tokenization_xlm import XLMTokenizer
|
||||||
from .tokenization_roberta import RobertaTokenizer
|
from .tokenization_roberta import RobertaTokenizer
|
||||||
|
from .tokenization_distilbert import DistilBertTokenizer
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -39,13 +40,14 @@ class AutoTokenizer(object):
|
|||||||
|
|
||||||
The tokenizer class to instantiate is selected as the first pattern matching
|
The tokenizer class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
|
||||||
|
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
||||||
- contains `bert`: BertTokenizer (Bert model)
|
- contains `bert`: BertTokenizer (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetTokenizer (XLNet model)
|
- contains `xlnet`: XLNetTokenizer (XLNet model)
|
||||||
- contains `xlm`: XLMTokenizer (XLM model)
|
- contains `xlm`: XLMTokenizer (XLM model)
|
||||||
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
|
||||||
|
|
||||||
This class cannot be instantiated using `__init__()` (throw an error).
|
This class cannot be instantiated using `__init__()` (throw an error).
|
||||||
"""
|
"""
|
||||||
@@ -60,32 +62,45 @@ class AutoTokenizer(object):
|
|||||||
|
|
||||||
The tokenizer class to instantiate is selected as the first pattern matching
|
The tokenizer class to instantiate is selected as the first pattern matching
|
||||||
in the `pretrained_model_name_or_path` string (in the following order):
|
in the `pretrained_model_name_or_path` string (in the following order):
|
||||||
|
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
|
||||||
|
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
||||||
- contains `bert`: BertTokenizer (Bert model)
|
- contains `bert`: BertTokenizer (Bert model)
|
||||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||||
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
||||||
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
|
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
|
||||||
- contains `xlnet`: XLNetTokenizer (XLNet model)
|
- contains `xlnet`: XLNetTokenizer (XLNet model)
|
||||||
- contains `xlm`: XLMTokenizer (XLM model)
|
- contains `xlm`: XLMTokenizer (XLM model)
|
||||||
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
|
||||||
|
|
||||||
Params:
|
Params:
|
||||||
**pretrained_model_name_or_path**: either:
|
pretrained_model_name_or_path: either:
|
||||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
|
|
||||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||||
- a path to a `directory` containing a configuration file saved
|
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||||
using the `save_pretrained(save_directory)` method.
|
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
|
||||||
- a path or url to a saved configuration `file`.
|
|
||||||
**cache_dir**: (`optional`) string:
|
cache_dir: (`optional`) string:
|
||||||
Path to a directory in which a downloaded pre-trained model
|
Path to a directory in which downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
|
||||||
configuration should be cached if the standard cache should not be used.
|
|
||||||
|
force_download: (`optional`) boolean, default False:
|
||||||
|
Force to (re-)download the vocabulary files and override the cached versions if they exist.
|
||||||
|
|
||||||
|
proxies: (`optional`) dict, default None:
|
||||||
|
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||||
|
The proxies are used on each request.
|
||||||
|
|
||||||
|
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
|
||||||
|
|
||||||
|
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache.
|
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache.
|
||||||
config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if 'roberta' in pretrained_model_name_or_path:
|
if 'distilbert' in pretrained_model_name_or_path:
|
||||||
|
return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||||
|
elif 'roberta' in pretrained_model_name_or_path:
|
||||||
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||||
elif 'bert' in pretrained_model_name_or_path:
|
elif 'bert' in pretrained_model_name_or_path:
|
||||||
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||||
|
|||||||
@@ -63,6 +63,23 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
'bert-base-cased-finetuned-mrpc': 512,
|
'bert-base-cased-finetuned-mrpc': 512,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PRETRAINED_INIT_CONFIGURATION = {
|
||||||
|
'bert-base-uncased': {'do_lower_case': True},
|
||||||
|
'bert-large-uncased': {'do_lower_case': True},
|
||||||
|
'bert-base-cased': {'do_lower_case': False},
|
||||||
|
'bert-large-cased': {'do_lower_case': False},
|
||||||
|
'bert-base-multilingual-uncased': {'do_lower_case': True},
|
||||||
|
'bert-base-multilingual-cased': {'do_lower_case': False},
|
||||||
|
'bert-base-chinese': {'do_lower_case': False},
|
||||||
|
'bert-base-german-cased': {'do_lower_case': False},
|
||||||
|
'bert-large-uncased-whole-word-masking': {'do_lower_case': True},
|
||||||
|
'bert-large-cased-whole-word-masking': {'do_lower_case': False},
|
||||||
|
'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
|
||||||
|
'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
|
||||||
|
'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def load_vocab(vocab_file):
|
def load_vocab(vocab_file):
|
||||||
"""Loads a vocabulary file into a dictionary."""
|
"""Loads a vocabulary file into a dictionary."""
|
||||||
vocab = collections.OrderedDict()
|
vocab = collections.OrderedDict()
|
||||||
@@ -100,6 +117,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
|
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
|
||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
|
||||||
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
|
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
|
||||||
@@ -174,15 +192,15 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Adds special tokens to a sequence for sequence classification tasks.
|
||||||
A BERT sequence has the following format: [CLS] X [SEP]
|
A BERT sequence has the following format: [CLS] X [SEP]
|
||||||
"""
|
"""
|
||||||
return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
|
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
||||||
|
|
||||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
Adds special tokens to a sequence pair for sequence classification tasks.
|
||||||
A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
||||||
"""
|
"""
|
||||||
sep = [self._convert_token_to_id(self.sep_token)]
|
sep = [self.sep_token_id]
|
||||||
cls = [self._convert_token_to_id(self.cls_token)]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def save_vocabulary(self, vocab_path):
|
def save_vocabulary(self, vocab_path):
|
||||||
@@ -202,24 +220,6 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
index += 1
|
index += 1
|
||||||
return (vocab_file,)
|
return (vocab_file,)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
|
||||||
""" Instantiate a BertTokenizer from pre-trained vocabulary files.
|
|
||||||
"""
|
|
||||||
if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES:
|
|
||||||
if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
|
|
||||||
logger.warning("The pre-trained model you are loading is a cased model but you have not set "
|
|
||||||
"`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
|
|
||||||
"you may want to check this behavior.")
|
|
||||||
kwargs['do_lower_case'] = False
|
|
||||||
elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
|
|
||||||
logger.warning("The pre-trained model you are loading is an uncased model but you have set "
|
|
||||||
"`do_lower_case` to False. We are setting `do_lower_case=True` for you "
|
|
||||||
"but you may want to check this behavior.")
|
|
||||||
kwargs['do_lower_case'] = True
|
|
||||||
|
|
||||||
return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class BasicTokenizer(object):
|
class BasicTokenizer(object):
|
||||||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
||||||
|
|||||||
@@ -64,13 +64,14 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
@lru_cache()
|
@lru_cache()
|
||||||
def bytes_to_unicode():
|
def bytes_to_unicode():
|
||||||
"""
|
"""
|
||||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
Returns list of utf-8 byte and a mapping to unicode strings.
|
||||||
|
We specifically avoid mapping to whitespace/control characters the bpe code barfs on.
|
||||||
|
|
||||||
The reversible bpe codes work on unicode strings.
|
The reversible bpe codes work on unicode strings.
|
||||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
|
||||||
"""
|
"""
|
||||||
_chr = unichr if sys.version_info[0] == 2 else chr
|
_chr = unichr if sys.version_info[0] == 2 else chr
|
||||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||||
@@ -99,7 +100,10 @@ def get_pairs(word):
|
|||||||
class GPT2Tokenizer(PreTrainedTokenizer):
|
class GPT2Tokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
GPT-2 BPE tokenizer. Peculiarities:
|
GPT-2 BPE tokenizer. Peculiarities:
|
||||||
- Byte-level BPE
|
- Byte-level Byte-Pair-Encoding
|
||||||
|
- Requires a space to start the input string => will add a space if there isn't one.
|
||||||
|
As a consequence, this tokenizer `encode` and `decode` method will not conserve
|
||||||
|
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
|
||||||
"""
|
"""
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
@@ -111,11 +115,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
|||||||
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
|
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||||
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
|
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||||
|
|
||||||
self.encoder = json.load(open(vocab_file))
|
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||||
self.errors = errors # how to handle errors in decoding
|
self.errors = errors # how to handle errors in decoding
|
||||||
self.byte_encoder = bytes_to_unicode()
|
self.byte_encoder = bytes_to_unicode()
|
||||||
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
|
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
||||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
||||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||||
@@ -171,12 +175,13 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
def _tokenize(self, text):
|
def _tokenize(self, text):
|
||||||
""" Tokenize a string. """
|
""" Tokenize a string. """
|
||||||
|
text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
|
||||||
bpe_tokens = []
|
bpe_tokens = []
|
||||||
for token in re.findall(self.pat, text):
|
for token in re.findall(self.pat, text):
|
||||||
if sys.version_info[0] == 2:
|
if sys.version_info[0] == 2:
|
||||||
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
|
||||||
else:
|
else:
|
||||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
|
||||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||||
return bpe_tokens
|
return bpe_tokens
|
||||||
|
|
||||||
@@ -216,4 +221,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
|||||||
writer.write(' '.join(bpe_tokens) + u'\n')
|
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||||
index += 1
|
index += 1
|
||||||
|
|
||||||
return vocab_file, merge_file
|
return vocab_file, merge_file
|
||||||
@@ -23,8 +23,7 @@ import os
|
|||||||
import regex as re
|
import regex as re
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
from .tokenization_gpt2 import bytes_to_unicode, get_pairs
|
from .tokenization_gpt2 import GPT2Tokenizer
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
@@ -63,9 +62,13 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class RobertaTokenizer(PreTrainedTokenizer):
|
class RobertaTokenizer(GPT2Tokenizer):
|
||||||
"""
|
"""
|
||||||
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
|
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
|
||||||
|
- Byte-level Byte-Pair-Encoding
|
||||||
|
- Requires a space to start the input string => will add a space if there isn't one.
|
||||||
|
As a consequence, this tokenizer `encode` and `decode` method will not conserve
|
||||||
|
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
|
||||||
"""
|
"""
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
@@ -73,132 +76,23 @@ class RobertaTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
|
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
|
||||||
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
|
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
|
||||||
super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
|
super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
|
||||||
|
bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
|
||||||
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
|
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
|
||||||
mask_token=mask_token, **kwargs)
|
mask_token=mask_token, **kwargs)
|
||||||
|
|
||||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
|
||||||
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
|
||||||
|
|
||||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
|
||||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
|
||||||
self.errors = errors # how to handle errors in decoding
|
|
||||||
self.byte_encoder = bytes_to_unicode()
|
|
||||||
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
|
||||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
|
||||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
|
||||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
|
||||||
self.cache = {}
|
|
||||||
|
|
||||||
# Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
|
||||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def vocab_size(self):
|
|
||||||
return len(self.encoder)
|
|
||||||
|
|
||||||
def bpe(self, token):
|
|
||||||
if token in self.cache:
|
|
||||||
return self.cache[token]
|
|
||||||
word = tuple(token)
|
|
||||||
pairs = get_pairs(word)
|
|
||||||
|
|
||||||
if not pairs:
|
|
||||||
return token
|
|
||||||
|
|
||||||
while True:
|
|
||||||
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
|
|
||||||
if bigram not in self.bpe_ranks:
|
|
||||||
break
|
|
||||||
first, second = bigram
|
|
||||||
new_word = []
|
|
||||||
i = 0
|
|
||||||
while i < len(word):
|
|
||||||
try:
|
|
||||||
j = word.index(first, i)
|
|
||||||
new_word.extend(word[i:j])
|
|
||||||
i = j
|
|
||||||
except:
|
|
||||||
new_word.extend(word[i:])
|
|
||||||
break
|
|
||||||
|
|
||||||
if word[i] == first and i < len(word)-1 and word[i+1] == second:
|
|
||||||
new_word.append(first+second)
|
|
||||||
i += 2
|
|
||||||
else:
|
|
||||||
new_word.append(word[i])
|
|
||||||
i += 1
|
|
||||||
new_word = tuple(new_word)
|
|
||||||
word = new_word
|
|
||||||
if len(word) == 1:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
pairs = get_pairs(word)
|
|
||||||
word = ' '.join(word)
|
|
||||||
self.cache[token] = word
|
|
||||||
return word
|
|
||||||
|
|
||||||
def _tokenize(self, text):
|
|
||||||
""" Tokenize a string. """
|
|
||||||
bpe_tokens = []
|
|
||||||
for token in re.findall(self.pat, text):
|
|
||||||
if sys.version_info[0] == 2:
|
|
||||||
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
|
||||||
else:
|
|
||||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
|
||||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
|
||||||
return bpe_tokens
|
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
|
||||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
|
||||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
|
||||||
|
|
||||||
def _convert_id_to_token(self, index):
|
|
||||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
|
||||||
return self.decoder.get(index)
|
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
|
||||||
""" Converts a sequence of tokens (string) in a single string. """
|
|
||||||
text = ''.join(tokens)
|
|
||||||
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
|
|
||||||
return text
|
|
||||||
|
|
||||||
def add_special_tokens_single_sentence(self, token_ids):
|
def add_special_tokens_single_sentence(self, token_ids):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Adds special tokens to a sequence for sequence classification tasks.
|
||||||
A RoBERTa sequence has the following format: <s> X </s>
|
A RoBERTa sequence has the following format: <s> X </s>
|
||||||
"""
|
"""
|
||||||
return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
|
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
||||||
|
|
||||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
Adds special tokens to a sequence pair for sequence classification tasks.
|
||||||
A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
|
A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
|
||||||
"""
|
"""
|
||||||
sep = [self._convert_token_to_id(self.sep_token)]
|
sep = [self.sep_token_id]
|
||||||
cls = [self._convert_token_to_id(self.cls_token)]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
|
||||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
|
||||||
if not os.path.isdir(save_directory):
|
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
|
||||||
return
|
|
||||||
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
|
|
||||||
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
|
|
||||||
|
|
||||||
with open(vocab_file, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(json.dumps(self.encoder, ensure_ascii=False))
|
|
||||||
|
|
||||||
index = 0
|
|
||||||
with open(merge_file, "w", encoding="utf-8") as writer:
|
|
||||||
writer.write(u'#version: 0.2\n')
|
|
||||||
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
|
|
||||||
if index != token_index:
|
|
||||||
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
|
|
||||||
" Please check that the tokenizer is not corrupted!".format(merge_file))
|
|
||||||
index = token_index
|
|
||||||
writer.write(' '.join(bpe_tokens) + u'\n')
|
|
||||||
index += 1
|
|
||||||
|
|
||||||
return vocab_file, merge_file
|
|
||||||
|
|||||||
@@ -95,7 +95,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
|||||||
# in a library like ours, at all.
|
# in a library like ours, at all.
|
||||||
vocab_dict = torch.load(pretrained_vocab_file)
|
vocab_dict = torch.load(pretrained_vocab_file)
|
||||||
for key, value in vocab_dict.items():
|
for key, value in vocab_dict.items():
|
||||||
self.__dict__[key] = value
|
if key not in self.__dict__:
|
||||||
|
self.__dict__[key] = value
|
||||||
|
|
||||||
if vocab_file is not None:
|
if vocab_file is not None:
|
||||||
self.build_vocab()
|
self.build_vocab()
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import six
|
import six
|
||||||
|
import copy
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
from .file_utils import cached_path
|
from .file_utils import cached_path
|
||||||
@@ -28,6 +29,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
|
SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
|
||||||
ADDED_TOKENS_FILE = 'added_tokens.json'
|
ADDED_TOKENS_FILE = 'added_tokens.json'
|
||||||
|
TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'
|
||||||
|
|
||||||
class PreTrainedTokenizer(object):
|
class PreTrainedTokenizer(object):
|
||||||
""" Base class for all tokenizers.
|
""" Base class for all tokenizers.
|
||||||
@@ -40,27 +42,29 @@ class PreTrainedTokenizer(object):
|
|||||||
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
|
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
|
||||||
- ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
|
- ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
|
||||||
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
|
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
|
||||||
|
- ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
|
|
||||||
- ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token``
|
- ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id``
|
||||||
|
|
||||||
- ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token``
|
- ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id``
|
||||||
|
|
||||||
- ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token``
|
- ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id``
|
||||||
|
|
||||||
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token``
|
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id``
|
||||||
|
|
||||||
- ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token``
|
- ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id``
|
||||||
|
|
||||||
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token``
|
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id``
|
||||||
|
|
||||||
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token``
|
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``
|
||||||
|
|
||||||
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens``
|
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
|
||||||
"""
|
"""
|
||||||
vocab_files_names = {}
|
vocab_files_names = {}
|
||||||
pretrained_vocab_files_map = {}
|
pretrained_vocab_files_map = {}
|
||||||
|
pretrained_init_configuration = {}
|
||||||
max_model_input_sizes = {}
|
max_model_input_sizes = {}
|
||||||
|
|
||||||
SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
|
SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
|
||||||
@@ -155,6 +159,46 @@ class PreTrainedTokenizer(object):
|
|||||||
def additional_special_tokens(self, value):
|
def additional_special_tokens(self, value):
|
||||||
self._additional_special_tokens = value
|
self._additional_special_tokens = value
|
||||||
|
|
||||||
|
@property
|
||||||
|
def bos_token_id(self):
|
||||||
|
""" Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """
|
||||||
|
return self.convert_tokens_to_ids(self.bos_token)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def eos_token_id(self):
|
||||||
|
""" Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """
|
||||||
|
return self.convert_tokens_to_ids(self.eos_token)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def unk_token_id(self):
|
||||||
|
""" Id of the unknown token in the vocabulary. Log an error if used while not having been set. """
|
||||||
|
return self.convert_tokens_to_ids(self.unk_token)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def sep_token_id(self):
|
||||||
|
""" Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
|
||||||
|
return self.convert_tokens_to_ids(self.sep_token)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def pad_token_id(self):
|
||||||
|
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """
|
||||||
|
return self.convert_tokens_to_ids(self.pad_token)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def cls_token_id(self):
|
||||||
|
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
|
||||||
|
return self.convert_tokens_to_ids(self.cls_token)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def mask_token_id(self):
|
||||||
|
""" Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
|
||||||
|
return self.convert_tokens_to_ids(self.mask_token)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def additional_special_tokens_ids(self):
|
||||||
|
""" Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """
|
||||||
|
return self.convert_tokens_to_ids(self.additional_special_tokens)
|
||||||
|
|
||||||
def __init__(self, max_len=None, **kwargs):
|
def __init__(self, max_len=None, **kwargs):
|
||||||
self._bos_token = None
|
self._bos_token = None
|
||||||
self._eos_token = None
|
self._eos_token = None
|
||||||
@@ -166,12 +210,15 @@ class PreTrainedTokenizer(object):
|
|||||||
self._additional_special_tokens = []
|
self._additional_special_tokens = []
|
||||||
|
|
||||||
self.max_len = max_len if max_len is not None else int(1e12)
|
self.max_len = max_len if max_len is not None else int(1e12)
|
||||||
self.max_len_single_sentence = self.max_len
|
|
||||||
self.max_len_sentences_pair = self.max_len
|
|
||||||
|
|
||||||
|
# Added tokens
|
||||||
self.added_tokens_encoder = {}
|
self.added_tokens_encoder = {}
|
||||||
self.added_tokens_decoder = {}
|
self.added_tokens_decoder = {}
|
||||||
|
|
||||||
|
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
|
||||||
|
self.init_inputs = ()
|
||||||
|
self.init_kwargs = {}
|
||||||
|
|
||||||
for key, value in kwargs.items():
|
for key, value in kwargs.items():
|
||||||
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
|
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
|
||||||
if key == 'additional_special_tokens':
|
if key == 'additional_special_tokens':
|
||||||
@@ -231,17 +278,20 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
|
||||||
cache_dir = kwargs.pop('cache_dir', None)
|
cache_dir = kwargs.pop('cache_dir', None)
|
||||||
force_download = kwargs.pop('force_download', False)
|
force_download = kwargs.pop('force_download', False)
|
||||||
proxies = kwargs.pop('proxies', None)
|
proxies = kwargs.pop('proxies', None)
|
||||||
|
|
||||||
s3_models = list(cls.max_model_input_sizes.keys())
|
s3_models = list(cls.max_model_input_sizes.keys())
|
||||||
vocab_files = {}
|
vocab_files = {}
|
||||||
|
init_configuration = {}
|
||||||
if pretrained_model_name_or_path in s3_models:
|
if pretrained_model_name_or_path in s3_models:
|
||||||
# Get the vocabulary from AWS S3 bucket
|
# Get the vocabulary from AWS S3 bucket
|
||||||
for file_id, map_list in cls.pretrained_vocab_files_map.items():
|
for file_id, map_list in cls.pretrained_vocab_files_map.items():
|
||||||
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
|
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
|
||||||
|
if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration:
|
||||||
|
init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path]
|
||||||
else:
|
else:
|
||||||
# Get the vocabulary from local files
|
# Get the vocabulary from local files
|
||||||
logger.info(
|
logger.info(
|
||||||
@@ -264,15 +314,17 @@ class PreTrainedTokenizer(object):
|
|||||||
vocab_files[file_id] = full_file_name
|
vocab_files[file_id] = full_file_name
|
||||||
|
|
||||||
# Look for the additional tokens files
|
# Look for the additional tokens files
|
||||||
all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
|
additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
|
||||||
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
|
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE,
|
||||||
|
'tokenizer_config_file': TOKENIZER_CONFIG_FILE,
|
||||||
|
}
|
||||||
|
|
||||||
# If a path to a file was provided, get the parent directory
|
# If a path to a file was provided, get the parent directory
|
||||||
saved_directory = pretrained_model_name_or_path
|
saved_directory = pretrained_model_name_or_path
|
||||||
if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
|
if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
|
||||||
saved_directory = os.path.dirname(saved_directory)
|
saved_directory = os.path.dirname(saved_directory)
|
||||||
|
|
||||||
for file_id, file_name in all_vocab_files_names.items():
|
for file_id, file_name in additional_files_names.items():
|
||||||
full_file_name = os.path.join(saved_directory, file_name)
|
full_file_name = os.path.join(saved_directory, file_name)
|
||||||
if not os.path.exists(full_file_name):
|
if not os.path.exists(full_file_name):
|
||||||
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
|
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
|
||||||
@@ -315,28 +367,46 @@ class PreTrainedTokenizer(object):
|
|||||||
logger.info("loading file {} from cache at {}".format(
|
logger.info("loading file {} from cache at {}".format(
|
||||||
file_path, resolved_vocab_files[file_id]))
|
file_path, resolved_vocab_files[file_id]))
|
||||||
|
|
||||||
|
# Prepare tokenizer initialization kwargs
|
||||||
|
# Did we saved some inputs and kwargs to reload ?
|
||||||
|
tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
|
||||||
|
if tokenizer_config_file is not None:
|
||||||
|
init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
|
||||||
|
saved_init_inputs = init_kwargs.pop('init_inputs', ())
|
||||||
|
if not init_inputs:
|
||||||
|
init_inputs = saved_init_inputs
|
||||||
|
else:
|
||||||
|
init_kwargs = init_configuration
|
||||||
|
|
||||||
|
# Update with newly provided kwargs
|
||||||
|
init_kwargs.update(kwargs)
|
||||||
|
|
||||||
# Set max length if needed
|
# Set max length if needed
|
||||||
if pretrained_model_name_or_path in cls.max_model_input_sizes:
|
if pretrained_model_name_or_path in cls.max_model_input_sizes:
|
||||||
# if we're using a pretrained model, ensure the tokenizer
|
# if we're using a pretrained model, ensure the tokenizer
|
||||||
# wont index sequences longer than the number of positional embeddings
|
# wont index sequences longer than the number of positional embeddings
|
||||||
max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
|
max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
|
||||||
if max_len is not None and isinstance(max_len, (int, float)):
|
if max_len is not None and isinstance(max_len, (int, float)):
|
||||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len)
|
||||||
|
|
||||||
# Merge resolved_vocab_files arguments in kwargs.
|
# Merge resolved_vocab_files arguments in init_kwargs.
|
||||||
added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
|
added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
|
||||||
special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
|
special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
|
||||||
for args_name, file_path in resolved_vocab_files.items():
|
for args_name, file_path in resolved_vocab_files.items():
|
||||||
if args_name not in kwargs:
|
if args_name not in init_kwargs:
|
||||||
kwargs[args_name] = file_path
|
init_kwargs[args_name] = file_path
|
||||||
if special_tokens_map_file is not None:
|
if special_tokens_map_file is not None:
|
||||||
special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
|
special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
|
||||||
for key, value in special_tokens_map.items():
|
for key, value in special_tokens_map.items():
|
||||||
if key not in kwargs:
|
if key not in init_kwargs:
|
||||||
kwargs[key] = value
|
init_kwargs[key] = value
|
||||||
|
|
||||||
# Instantiate tokenizer.
|
# Instantiate tokenizer.
|
||||||
tokenizer = cls(*inputs, **kwargs)
|
tokenizer = cls(*init_inputs, **init_kwargs)
|
||||||
|
|
||||||
|
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
|
||||||
|
tokenizer.init_inputs = init_inputs
|
||||||
|
tokenizer.init_kwargs = init_kwargs
|
||||||
|
|
||||||
# Add supplementary tokens.
|
# Add supplementary tokens.
|
||||||
if added_tokens_file is not None:
|
if added_tokens_file is not None:
|
||||||
@@ -349,8 +419,13 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
def save_pretrained(self, save_directory):
|
||||||
""" Save the tokenizer vocabulary files (with added tokens) and the
|
""" Save the tokenizer vocabulary files together with:
|
||||||
special-tokens-to-class-attributes-mapping to a directory.
|
- added tokens,
|
||||||
|
- special-tokens-to-class-attributes-mapping,
|
||||||
|
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
|
||||||
|
|
||||||
|
This won't save modifications other than (added tokens and special token mapping) you may have
|
||||||
|
applied to the tokenizer after the instantion (e.g. modifying tokenizer.do_lower_case after creation).
|
||||||
|
|
||||||
This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
|
This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
|
||||||
"""
|
"""
|
||||||
@@ -360,6 +435,15 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
|
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
|
||||||
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
|
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
|
||||||
|
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
|
||||||
|
|
||||||
|
tokenizer_config = copy.deepcopy(self.init_kwargs)
|
||||||
|
tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs)
|
||||||
|
for file_id in self.vocab_files_names.keys():
|
||||||
|
tokenizer_config.pop(file_id, None)
|
||||||
|
|
||||||
|
with open(tokenizer_config_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
|
||||||
|
|
||||||
with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
|
with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
|
||||||
f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
|
f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
|
||||||
@@ -441,6 +525,13 @@ class PreTrainedTokenizer(object):
|
|||||||
to class attributes. If special tokens are NOT in the vocabulary, they are added
|
to class attributes. If special tokens are NOT in the vocabulary, they are added
|
||||||
to it (indexed starting from the last index of the current vocabulary).
|
to it (indexed starting from the last index of the current vocabulary).
|
||||||
|
|
||||||
|
Using `add_special_tokens` will ensure your special tokens can be used in several ways:
|
||||||
|
|
||||||
|
- special tokens are carefully handled by the tokenizer (they are never split)
|
||||||
|
- you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts.
|
||||||
|
|
||||||
|
When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>')
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
|
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
|
||||||
[``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
|
[``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
|
||||||
@@ -546,6 +637,9 @@ class PreTrainedTokenizer(object):
|
|||||||
""" Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
|
""" Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
|
||||||
(resp. a sequence of ids), using the vocabulary.
|
(resp. a sequence of ids), using the vocabulary.
|
||||||
"""
|
"""
|
||||||
|
if tokens is None:
|
||||||
|
return None
|
||||||
|
|
||||||
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
|
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
|
||||||
return self._convert_token_to_id_with_added_voc(tokens)
|
return self._convert_token_to_id_with_added_voc(tokens)
|
||||||
|
|
||||||
@@ -559,6 +653,9 @@ class PreTrainedTokenizer(object):
|
|||||||
return ids
|
return ids
|
||||||
|
|
||||||
def _convert_token_to_id_with_added_voc(self, token):
|
def _convert_token_to_id_with_added_voc(self, token):
|
||||||
|
if token is None:
|
||||||
|
return None
|
||||||
|
|
||||||
if token in self.added_tokens_encoder:
|
if token in self.added_tokens_encoder:
|
||||||
return self.added_tokens_encoder[token]
|
return self.added_tokens_encoder[token]
|
||||||
return self._convert_token_to_id(token)
|
return self._convert_token_to_id(token)
|
||||||
@@ -566,7 +663,7 @@ class PreTrainedTokenizer(object):
|
|||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def encode(self, text, text_pair=None, add_special_tokens=False):
|
def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs):
|
||||||
"""
|
"""
|
||||||
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
|
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
|
||||||
|
|
||||||
@@ -577,15 +674,16 @@ class PreTrainedTokenizer(object):
|
|||||||
text_pair: Optional second sequence to be encoded.
|
text_pair: Optional second sequence to be encoded.
|
||||||
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
|
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
|
||||||
to their model.
|
to their model.
|
||||||
|
**kwargs: passed to the `self.tokenize()` method
|
||||||
"""
|
"""
|
||||||
if text_pair is None:
|
if text_pair is None:
|
||||||
if add_special_tokens:
|
if add_special_tokens:
|
||||||
return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
|
return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text, **kwargs)))
|
||||||
else:
|
else:
|
||||||
return self.convert_tokens_to_ids(self.tokenize(text))
|
return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
|
||||||
|
|
||||||
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
|
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)]
|
||||||
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)]
|
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]
|
||||||
|
|
||||||
if add_special_tokens:
|
if add_special_tokens:
|
||||||
return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
|
return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
|
||||||
@@ -614,7 +712,7 @@ class PreTrainedTokenizer(object):
|
|||||||
return self._convert_id_to_token(ids)
|
return self._convert_id_to_token(ids)
|
||||||
tokens = []
|
tokens = []
|
||||||
for index in ids:
|
for index in ids:
|
||||||
if index in self.all_special_ids and skip_special_tokens:
|
if skip_special_tokens and index in self.all_special_ids:
|
||||||
continue
|
continue
|
||||||
if index in self.added_tokens_decoder:
|
if index in self.added_tokens_decoder:
|
||||||
tokens.append(self.added_tokens_decoder[index])
|
tokens.append(self.added_tokens_decoder[index])
|
||||||
@@ -639,7 +737,25 @@ class PreTrainedTokenizer(object):
|
|||||||
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
|
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
|
||||||
"""
|
"""
|
||||||
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
||||||
text = self.convert_tokens_to_string(filtered_tokens)
|
|
||||||
|
# To avoid mixing byte-level and unicode for byte-level BPT
|
||||||
|
# we need to build string separatly for added tokens and byte-level tokens
|
||||||
|
# cf. https://github.com/huggingface/pytorch-transformers/issues/1133
|
||||||
|
sub_texts = []
|
||||||
|
current_sub_text = []
|
||||||
|
for token in filtered_tokens:
|
||||||
|
if skip_special_tokens and token in self.all_special_ids:
|
||||||
|
continue
|
||||||
|
if token in self.added_tokens_encoder:
|
||||||
|
if current_sub_text:
|
||||||
|
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
|
||||||
|
current_sub_text = []
|
||||||
|
sub_texts.append(" " + token)
|
||||||
|
else:
|
||||||
|
current_sub_text.append(token)
|
||||||
|
if current_sub_text:
|
||||||
|
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
|
||||||
|
text = ''.join(sub_texts)
|
||||||
|
|
||||||
if self._sep_token is not None and self._sep_token in text:
|
if self._sep_token is not None and self._sep_token in text:
|
||||||
text = text.replace(self._cls_token, self._sep_token)
|
text = text.replace(self._cls_token, self._sep_token)
|
||||||
@@ -676,7 +792,7 @@ class PreTrainedTokenizer(object):
|
|||||||
all_toks = []
|
all_toks = []
|
||||||
set_attr = self.special_tokens_map
|
set_attr = self.special_tokens_map
|
||||||
for attr_value in set_attr.values():
|
for attr_value in set_attr.values():
|
||||||
all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value])
|
all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
|
||||||
all_toks = list(set(all_toks))
|
all_toks = list(set(all_toks))
|
||||||
return all_toks
|
return all_toks
|
||||||
|
|
||||||
|
|||||||
@@ -20,8 +20,12 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
import unicodedata
|
||||||
from io import open
|
from io import open
|
||||||
|
|
||||||
|
import sacremoses as sm
|
||||||
|
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
from .tokenization_utils import PreTrainedTokenizer
|
||||||
from .tokenization_bert import BasicTokenizer
|
from .tokenization_bert import BasicTokenizer
|
||||||
|
|
||||||
@@ -43,6 +47,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
|||||||
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
|
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
|
||||||
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
|
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
|
||||||
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
|
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
|
||||||
|
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json",
|
||||||
|
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json",
|
||||||
},
|
},
|
||||||
'merges_file':
|
'merges_file':
|
||||||
{
|
{
|
||||||
@@ -54,6 +60,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
|||||||
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
|
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
|
||||||
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
|
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
|
||||||
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
|
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
|
||||||
|
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt",
|
||||||
|
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -66,6 +74,342 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
'xlm-mlm-xnli15-1024': 512,
|
'xlm-mlm-xnli15-1024': 512,
|
||||||
'xlm-clm-enfr-1024': 512,
|
'xlm-clm-enfr-1024': 512,
|
||||||
'xlm-clm-ende-1024': 512,
|
'xlm-clm-ende-1024': 512,
|
||||||
|
'xlm-mlm-17-1280': 512,
|
||||||
|
'xlm-mlm-100-1280': 512,
|
||||||
|
}
|
||||||
|
|
||||||
|
PRETRAINED_INIT_CONFIGURATION = {
|
||||||
|
'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True},
|
||||||
|
'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True,
|
||||||
|
"id2lang": { "0": "de",
|
||||||
|
"1": "en"},
|
||||||
|
"lang2id": { "de": 0,
|
||||||
|
"en": 1 }},
|
||||||
|
'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True,
|
||||||
|
"id2lang": { "0": "en",
|
||||||
|
"1": "fr"},
|
||||||
|
"lang2id": { "en": 0,
|
||||||
|
"fr": 1 }},
|
||||||
|
'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True,
|
||||||
|
"id2lang": { "0": "en",
|
||||||
|
"1": "ro"},
|
||||||
|
"lang2id": { "en": 0,
|
||||||
|
"ro": 1 }},
|
||||||
|
'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True,
|
||||||
|
"id2lang": { "0": "ar",
|
||||||
|
"1": "bg",
|
||||||
|
"2": "de",
|
||||||
|
"3": "el",
|
||||||
|
"4": "en",
|
||||||
|
"5": "es",
|
||||||
|
"6": "fr",
|
||||||
|
"7": "hi",
|
||||||
|
"8": "ru",
|
||||||
|
"9": "sw",
|
||||||
|
"10": "th",
|
||||||
|
"11": "tr",
|
||||||
|
"12": "ur",
|
||||||
|
"13": "vi",
|
||||||
|
"14": "zh"},
|
||||||
|
"lang2id": { "ar": 0,
|
||||||
|
"bg": 1,
|
||||||
|
"de": 2,
|
||||||
|
"el": 3,
|
||||||
|
"en": 4,
|
||||||
|
"es": 5,
|
||||||
|
"fr": 6,
|
||||||
|
"hi": 7,
|
||||||
|
"ru": 8,
|
||||||
|
"sw": 9,
|
||||||
|
"th": 10,
|
||||||
|
"tr": 11,
|
||||||
|
"ur": 12,
|
||||||
|
"vi": 13,
|
||||||
|
"zh": 14 }},
|
||||||
|
'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True,
|
||||||
|
"id2lang": { "0": "ar",
|
||||||
|
"1": "bg",
|
||||||
|
"2": "de",
|
||||||
|
"3": "el",
|
||||||
|
"4": "en",
|
||||||
|
"5": "es",
|
||||||
|
"6": "fr",
|
||||||
|
"7": "hi",
|
||||||
|
"8": "ru",
|
||||||
|
"9": "sw",
|
||||||
|
"10": "th",
|
||||||
|
"11": "tr",
|
||||||
|
"12": "ur",
|
||||||
|
"13": "vi",
|
||||||
|
"14": "zh"},
|
||||||
|
"lang2id": { "ar": 0,
|
||||||
|
"bg": 1,
|
||||||
|
"de": 2,
|
||||||
|
"el": 3,
|
||||||
|
"en": 4,
|
||||||
|
"es": 5,
|
||||||
|
"fr": 6,
|
||||||
|
"hi": 7,
|
||||||
|
"ru": 8,
|
||||||
|
"sw": 9,
|
||||||
|
"th": 10,
|
||||||
|
"tr": 11,
|
||||||
|
"ur": 12,
|
||||||
|
"vi": 13,
|
||||||
|
"zh": 14 }},
|
||||||
|
'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True,
|
||||||
|
"id2lang": { "0": "en",
|
||||||
|
"1": "fr"},
|
||||||
|
"lang2id": { "en": 0,
|
||||||
|
"fr": 1 }},
|
||||||
|
'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True,
|
||||||
|
"id2lang": { "0": "de",
|
||||||
|
"1": "en"},
|
||||||
|
"lang2id": { "de": 0,
|
||||||
|
"en": 1 }},
|
||||||
|
'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False,
|
||||||
|
"id2lang": {
|
||||||
|
"0": "ar",
|
||||||
|
"1": "de",
|
||||||
|
"2": "en",
|
||||||
|
"3": "es",
|
||||||
|
"4": "fr",
|
||||||
|
"5": "hi",
|
||||||
|
"6": "it",
|
||||||
|
"7": "ja",
|
||||||
|
"8": "ko",
|
||||||
|
"9": "nl",
|
||||||
|
"10": "pl",
|
||||||
|
"11": "pt",
|
||||||
|
"12": "ru",
|
||||||
|
"13": "sv",
|
||||||
|
"14": "tr",
|
||||||
|
"15": "vi",
|
||||||
|
"16": "zh"
|
||||||
|
},
|
||||||
|
"lang2id": {
|
||||||
|
"ar": 0,
|
||||||
|
"de": 1,
|
||||||
|
"en": 2,
|
||||||
|
"es": 3,
|
||||||
|
"fr": 4,
|
||||||
|
"hi": 5,
|
||||||
|
"it": 6,
|
||||||
|
"ja": 7,
|
||||||
|
"ko": 8,
|
||||||
|
"nl": 9,
|
||||||
|
"pl": 10,
|
||||||
|
"pt": 11,
|
||||||
|
"ru": 12,
|
||||||
|
"sv": 13,
|
||||||
|
"tr": 14,
|
||||||
|
"vi": 15,
|
||||||
|
"zh": 16}},
|
||||||
|
'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False,
|
||||||
|
"id2lang": {
|
||||||
|
"0": "af",
|
||||||
|
"1": "als",
|
||||||
|
"2": "am",
|
||||||
|
"3": "an",
|
||||||
|
"4": "ang",
|
||||||
|
"5": "ar",
|
||||||
|
"6": "arz",
|
||||||
|
"7": "ast",
|
||||||
|
"8": "az",
|
||||||
|
"9": "bar",
|
||||||
|
"10": "be",
|
||||||
|
"11": "bg",
|
||||||
|
"12": "bn",
|
||||||
|
"13": "br",
|
||||||
|
"14": "bs",
|
||||||
|
"15": "ca",
|
||||||
|
"16": "ceb",
|
||||||
|
"17": "ckb",
|
||||||
|
"18": "cs",
|
||||||
|
"19": "cy",
|
||||||
|
"20": "da",
|
||||||
|
"21": "de",
|
||||||
|
"22": "el",
|
||||||
|
"23": "en",
|
||||||
|
"24": "eo",
|
||||||
|
"25": "es",
|
||||||
|
"26": "et",
|
||||||
|
"27": "eu",
|
||||||
|
"28": "fa",
|
||||||
|
"29": "fi",
|
||||||
|
"30": "fr",
|
||||||
|
"31": "fy",
|
||||||
|
"32": "ga",
|
||||||
|
"33": "gan",
|
||||||
|
"34": "gl",
|
||||||
|
"35": "gu",
|
||||||
|
"36": "he",
|
||||||
|
"37": "hi",
|
||||||
|
"38": "hr",
|
||||||
|
"39": "hu",
|
||||||
|
"40": "hy",
|
||||||
|
"41": "ia",
|
||||||
|
"42": "id",
|
||||||
|
"43": "is",
|
||||||
|
"44": "it",
|
||||||
|
"45": "ja",
|
||||||
|
"46": "jv",
|
||||||
|
"47": "ka",
|
||||||
|
"48": "kk",
|
||||||
|
"49": "kn",
|
||||||
|
"50": "ko",
|
||||||
|
"51": "ku",
|
||||||
|
"52": "la",
|
||||||
|
"53": "lb",
|
||||||
|
"54": "lt",
|
||||||
|
"55": "lv",
|
||||||
|
"56": "mk",
|
||||||
|
"57": "ml",
|
||||||
|
"58": "mn",
|
||||||
|
"59": "mr",
|
||||||
|
"60": "ms",
|
||||||
|
"61": "my",
|
||||||
|
"62": "nds",
|
||||||
|
"63": "ne",
|
||||||
|
"64": "nl",
|
||||||
|
"65": "nn",
|
||||||
|
"66": "no",
|
||||||
|
"67": "oc",
|
||||||
|
"68": "pl",
|
||||||
|
"69": "pt",
|
||||||
|
"70": "ro",
|
||||||
|
"71": "ru",
|
||||||
|
"72": "scn",
|
||||||
|
"73": "sco",
|
||||||
|
"74": "sh",
|
||||||
|
"75": "si",
|
||||||
|
"76": "simple",
|
||||||
|
"77": "sk",
|
||||||
|
"78": "sl",
|
||||||
|
"79": "sq",
|
||||||
|
"80": "sr",
|
||||||
|
"81": "sv",
|
||||||
|
"82": "sw",
|
||||||
|
"83": "ta",
|
||||||
|
"84": "te",
|
||||||
|
"85": "th",
|
||||||
|
"86": "tl",
|
||||||
|
"87": "tr",
|
||||||
|
"88": "tt",
|
||||||
|
"89": "uk",
|
||||||
|
"90": "ur",
|
||||||
|
"91": "uz",
|
||||||
|
"92": "vi",
|
||||||
|
"93": "war",
|
||||||
|
"94": "wuu",
|
||||||
|
"95": "yi",
|
||||||
|
"96": "zh",
|
||||||
|
"97": "zh_classical",
|
||||||
|
"98": "zh_min_nan",
|
||||||
|
"99": "zh_yue"
|
||||||
|
},
|
||||||
|
"lang2id": {
|
||||||
|
"af": 0,
|
||||||
|
"als": 1,
|
||||||
|
"am": 2,
|
||||||
|
"an": 3,
|
||||||
|
"ang": 4,
|
||||||
|
"ar": 5,
|
||||||
|
"arz": 6,
|
||||||
|
"ast": 7,
|
||||||
|
"az": 8,
|
||||||
|
"bar": 9,
|
||||||
|
"be": 10,
|
||||||
|
"bg": 11,
|
||||||
|
"bn": 12,
|
||||||
|
"br": 13,
|
||||||
|
"bs": 14,
|
||||||
|
"ca": 15,
|
||||||
|
"ceb": 16,
|
||||||
|
"ckb": 17,
|
||||||
|
"cs": 18,
|
||||||
|
"cy": 19,
|
||||||
|
"da": 20,
|
||||||
|
"de": 21,
|
||||||
|
"el": 22,
|
||||||
|
"en": 23,
|
||||||
|
"eo": 24,
|
||||||
|
"es": 25,
|
||||||
|
"et": 26,
|
||||||
|
"eu": 27,
|
||||||
|
"fa": 28,
|
||||||
|
"fi": 29,
|
||||||
|
"fr": 30,
|
||||||
|
"fy": 31,
|
||||||
|
"ga": 32,
|
||||||
|
"gan": 33,
|
||||||
|
"gl": 34,
|
||||||
|
"gu": 35,
|
||||||
|
"he": 36,
|
||||||
|
"hi": 37,
|
||||||
|
"hr": 38,
|
||||||
|
"hu": 39,
|
||||||
|
"hy": 40,
|
||||||
|
"ia": 41,
|
||||||
|
"id": 42,
|
||||||
|
"is": 43,
|
||||||
|
"it": 44,
|
||||||
|
"ja": 45,
|
||||||
|
"jv": 46,
|
||||||
|
"ka": 47,
|
||||||
|
"kk": 48,
|
||||||
|
"kn": 49,
|
||||||
|
"ko": 50,
|
||||||
|
"ku": 51,
|
||||||
|
"la": 52,
|
||||||
|
"lb": 53,
|
||||||
|
"lt": 54,
|
||||||
|
"lv": 55,
|
||||||
|
"mk": 56,
|
||||||
|
"ml": 57,
|
||||||
|
"mn": 58,
|
||||||
|
"mr": 59,
|
||||||
|
"ms": 60,
|
||||||
|
"my": 61,
|
||||||
|
"nds": 62,
|
||||||
|
"ne": 63,
|
||||||
|
"nl": 64,
|
||||||
|
"nn": 65,
|
||||||
|
"no": 66,
|
||||||
|
"oc": 67,
|
||||||
|
"pl": 68,
|
||||||
|
"pt": 69,
|
||||||
|
"ro": 70,
|
||||||
|
"ru": 71,
|
||||||
|
"scn": 72,
|
||||||
|
"sco": 73,
|
||||||
|
"sh": 74,
|
||||||
|
"si": 75,
|
||||||
|
"simple": 76,
|
||||||
|
"sk": 77,
|
||||||
|
"sl": 78,
|
||||||
|
"sq": 79,
|
||||||
|
"sr": 80,
|
||||||
|
"sv": 81,
|
||||||
|
"sw": 82,
|
||||||
|
"ta": 83,
|
||||||
|
"te": 84,
|
||||||
|
"th": 85,
|
||||||
|
"tl": 86,
|
||||||
|
"tr": 87,
|
||||||
|
"tt": 88,
|
||||||
|
"uk": 89,
|
||||||
|
"ur": 90,
|
||||||
|
"uz": 91,
|
||||||
|
"vi": 92,
|
||||||
|
"war": 93,
|
||||||
|
"wuu": 94,
|
||||||
|
"yi": 95,
|
||||||
|
"zh": 96,
|
||||||
|
"zh_classical": 97,
|
||||||
|
"zh_min_nan": 98,
|
||||||
|
"zh_yue": 99
|
||||||
|
}},
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_pairs(word):
|
def get_pairs(word):
|
||||||
@@ -80,62 +424,145 @@ def get_pairs(word):
|
|||||||
prev_char = char
|
prev_char = char
|
||||||
return pairs
|
return pairs
|
||||||
|
|
||||||
def text_standardize(text):
|
|
||||||
|
def lowercase_and_remove_accent(text):
|
||||||
"""
|
"""
|
||||||
fixes some issues the spacy tokenizer had on books corpus
|
Lowercases and strips accents from a piece of text, based on
|
||||||
also does some whitespace standardization
|
https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py
|
||||||
"""
|
"""
|
||||||
text = text.replace('—', '-')
|
text = ' '.join(text)
|
||||||
text = text.replace('–', '-')
|
text = text.lower()
|
||||||
text = text.replace('―', '-')
|
text = unicodedata.normalize("NFD", text)
|
||||||
|
output = []
|
||||||
|
for char in text:
|
||||||
|
cat = unicodedata.category(char)
|
||||||
|
if cat == "Mn":
|
||||||
|
continue
|
||||||
|
output.append(char)
|
||||||
|
return "".join(output).lower().split(' ')
|
||||||
|
|
||||||
|
|
||||||
|
def replace_unicode_punct(text):
|
||||||
|
'''
|
||||||
|
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
|
||||||
|
'''
|
||||||
|
text = text.replace(',', ',')
|
||||||
|
text = re.sub(r'。\s*', '. ', text)
|
||||||
|
text = text.replace('、', ',')
|
||||||
|
text = text.replace('”', '"')
|
||||||
|
text = text.replace('“', '"')
|
||||||
|
text = text.replace('∶', ':')
|
||||||
|
text = text.replace(':', ':')
|
||||||
|
text = text.replace('?', '?')
|
||||||
|
text = text.replace('《', '"')
|
||||||
|
text = text.replace('》', '"')
|
||||||
|
text = text.replace(')', ')')
|
||||||
|
text = text.replace('!', '!')
|
||||||
|
text = text.replace('(', '(')
|
||||||
|
text = text.replace(';', ';')
|
||||||
|
text = text.replace('1', '"')
|
||||||
|
text = text.replace('」', '"')
|
||||||
|
text = text.replace('「', '"')
|
||||||
|
text = text.replace('0', '0')
|
||||||
|
text = text.replace('3', '3')
|
||||||
|
text = text.replace('2', '2')
|
||||||
|
text = text.replace('5', '5')
|
||||||
|
text = text.replace('6', '6')
|
||||||
|
text = text.replace('9', '9')
|
||||||
|
text = text.replace('7', '7')
|
||||||
|
text = text.replace('8', '8')
|
||||||
|
text = text.replace('4', '4')
|
||||||
|
text = re.sub(r'.\s*', '. ', text)
|
||||||
|
text = text.replace('~', '~')
|
||||||
|
text = text.replace('’', '\'')
|
||||||
text = text.replace('…', '...')
|
text = text.replace('…', '...')
|
||||||
text = text.replace('´', "'")
|
text = text.replace('━', '-')
|
||||||
text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
|
text = text.replace('〈', '<')
|
||||||
text = re.sub(r'\s*\n\s*', ' \n ', text)
|
text = text.replace('〉', '>')
|
||||||
text = re.sub(r'[^\S\n]+', ' ', text)
|
text = text.replace('【', '[')
|
||||||
return text.strip()
|
text = text.replace('】', ']')
|
||||||
|
text = text.replace('%', '%')
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def remove_non_printing_char(text):
|
||||||
|
'''
|
||||||
|
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
|
||||||
|
'''
|
||||||
|
output = []
|
||||||
|
for char in text:
|
||||||
|
cat = unicodedata.category(char)
|
||||||
|
if cat.startswith('C'):
|
||||||
|
continue
|
||||||
|
output.append(char)
|
||||||
|
return "".join(output)
|
||||||
|
|
||||||
|
|
||||||
|
def romanian_preprocessing(text):
|
||||||
|
'''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`'''
|
||||||
|
# https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py
|
||||||
|
text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
|
||||||
|
text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
|
||||||
|
# https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py
|
||||||
|
text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma
|
||||||
|
text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma
|
||||||
|
text = text.replace("\u0102", "A").replace("\u0103", "a")
|
||||||
|
text = text.replace("\u00C2", "A").replace("\u00E2", "a")
|
||||||
|
text = text.replace("\u00CE", "I").replace("\u00EE", "i")
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
class XLMTokenizer(PreTrainedTokenizer):
|
class XLMTokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities:
|
BPE tokenizer for XLM
|
||||||
|
|
||||||
- lower case all inputs
|
- Moses preprocessing & tokenization for most supported languages
|
||||||
|
|
||||||
- uses `SpaCy tokenizer <https://spacy.io/api/tokenizer/>`_ and \
|
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
|
||||||
`ftfy <https://ftfy.readthedocs.io/en/latest/>`_ for pre-BPE tokenization if they are installed, \
|
|
||||||
fallback to BERT's BasicTokenizer if not.
|
- (optionally) lower case & normalize all inputs text
|
||||||
|
|
||||||
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
|
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
|
||||||
(ex: "__classify__") to a vocabulary.
|
(ex: "__classify__") to a vocabulary
|
||||||
|
|
||||||
|
- `lang2id` attribute maps the languages supported by the model to their ids if provided (automatically set for pretrained vocabularies)
|
||||||
|
|
||||||
|
- `id2lang` attribute does the reverse mapping if provided (automatically set for pretrained vocabularies)
|
||||||
|
|
||||||
|
- `do_lowercase_and_remove_accent` controls lower casing and accent removal (automatically set for pretrained vocabularies)
|
||||||
"""
|
"""
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
|
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
|
||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
|
||||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
|
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
|
||||||
sep_token="</s>", pad_token="<pad>", cls_token="</s>",
|
sep_token="</s>", pad_token="<pad>", cls_token="</s>",
|
||||||
mask_token="<special1>", additional_special_tokens=["<special0>",
|
mask_token="<special1>", additional_special_tokens=["<special0>",
|
||||||
"<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
|
"<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
|
||||||
"<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
|
"<special6>", "<special7>", "<special8>", "<special9>"],
|
||||||
|
lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True,
|
||||||
|
**kwargs):
|
||||||
super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
|
super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
|
||||||
sep_token=sep_token, pad_token=pad_token,
|
sep_token=sep_token, pad_token=pad_token,
|
||||||
cls_token=cls_token, mask_token=mask_token,
|
cls_token=cls_token, mask_token=mask_token,
|
||||||
additional_special_tokens=additional_special_tokens,
|
additional_special_tokens=additional_special_tokens,
|
||||||
**kwargs)
|
**kwargs)
|
||||||
|
|
||||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
# cache of sm.MosesPunctNormalizer instance
|
||||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
self.cache_moses_punct_normalizer = dict()
|
||||||
|
# cache of sm.MosesTokenizer instance
|
||||||
|
self.cache_moses_tokenizer = dict()
|
||||||
|
self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja'])
|
||||||
|
# True for current supported model (v1.2.0), False for XLM-17 & 100
|
||||||
|
self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
|
||||||
|
self.lang2id = lang2id
|
||||||
|
self.id2lang = id2lang
|
||||||
|
if lang2id is not None and id2lang is not None:
|
||||||
|
assert len(lang2id) == len(id2lang)
|
||||||
|
|
||||||
try:
|
self.ja_word_tokenizer = None
|
||||||
import ftfy
|
self.zh_word_tokenizer = None
|
||||||
from spacy.lang.en import English
|
|
||||||
_nlp = English()
|
|
||||||
self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
|
|
||||||
self.fix_text = ftfy.fix_text
|
|
||||||
except ImportError:
|
|
||||||
logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
|
|
||||||
self.nlp = BasicTokenizer(do_lower_case=True)
|
|
||||||
self.fix_text = None
|
|
||||||
|
|
||||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||||
@@ -144,6 +571,43 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||||
self.cache = {}
|
self.cache = {}
|
||||||
|
|
||||||
|
def moses_punct_norm(self, text, lang):
|
||||||
|
if lang not in self.cache_moses_punct_normalizer:
|
||||||
|
punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
|
||||||
|
self.cache_moses_punct_normalizer[lang] = punct_normalizer
|
||||||
|
else:
|
||||||
|
punct_normalizer = self.cache_moses_punct_normalizer[lang]
|
||||||
|
return punct_normalizer.normalize(text)
|
||||||
|
|
||||||
|
def moses_tokenize(self, text, lang):
|
||||||
|
if lang not in self.cache_moses_tokenizer:
|
||||||
|
moses_tokenizer = sm.MosesTokenizer(lang=lang)
|
||||||
|
self.cache_moses_tokenizer[lang] = moses_tokenizer
|
||||||
|
else:
|
||||||
|
moses_tokenizer = self.cache_moses_tokenizer[lang]
|
||||||
|
return moses_tokenizer.tokenize(text, return_str=False, escape=False)
|
||||||
|
|
||||||
|
def moses_pipeline(self, text, lang):
|
||||||
|
text = replace_unicode_punct(text)
|
||||||
|
text = self.moses_punct_norm(text, lang)
|
||||||
|
text = remove_non_printing_char(text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
def ja_tokenize(self, text):
|
||||||
|
if self.ja_word_tokenizer is None:
|
||||||
|
try:
|
||||||
|
import Mykytea
|
||||||
|
self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
|
||||||
|
except (AttributeError, ImportError) as e:
|
||||||
|
logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
|
||||||
|
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
|
||||||
|
logger.error("2. autoreconf -i")
|
||||||
|
logger.error("3. ./configure --prefix=$HOME/local")
|
||||||
|
logger.error("4. make && make install")
|
||||||
|
logger.error("5. pip install kytea")
|
||||||
|
raise e
|
||||||
|
return list(self.ja_word_tokenizer.getWS(text))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def vocab_size(self):
|
||||||
return len(self.encoder)
|
return len(self.encoder)
|
||||||
@@ -191,19 +655,90 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
self.cache[token] = word
|
self.cache[token] = word
|
||||||
return word
|
return word
|
||||||
|
|
||||||
def _tokenize(self, text):
|
def _tokenize(self, text, lang='en', bypass_tokenizer=False):
|
||||||
""" Tokenize a string. """
|
"""
|
||||||
split_tokens = []
|
Tokenize a string given a language code. For Chinese, Japanese and Thai, we use a language-specific tokenizer. Otherwise, we use Moses.
|
||||||
if self.fix_text is None:
|
|
||||||
# Using BERT's BasicTokenizer
|
Details of tokenization:
|
||||||
text = self.nlp.tokenize(text)
|
- [sacremoses](https://github.com/alvations/sacremoses): port of Moses
|
||||||
for token in text:
|
- Install with `pip install sacremoses`
|
||||||
split_tokens.extend([t for t in self.bpe(token).split(' ')])
|
- [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer
|
||||||
|
- Install with `pip install pythainlp`
|
||||||
|
- [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea)
|
||||||
|
- Install with the following steps:
|
||||||
|
```
|
||||||
|
git clone git@github.com:neubig/kytea.git && cd kytea
|
||||||
|
autoreconf -i
|
||||||
|
./configure --prefix=$HOME/local
|
||||||
|
make && make install
|
||||||
|
pip install kytea
|
||||||
|
```
|
||||||
|
- [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer *
|
||||||
|
- Install with `pip install jieba`
|
||||||
|
|
||||||
|
\* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip).
|
||||||
|
However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated.
|
||||||
|
Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine
|
||||||
|
if you fine-tune the model with Chinese supervision. If you want the same exact behaviour, use the original XLM
|
||||||
|
[preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally,
|
||||||
|
and set `bypass_tokenizer=True` to bypass the tokenizer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
- lang: ISO language code (default = 'en') (string). Languages should belong to the model's supported languages. However, we don't enforce it.
|
||||||
|
- bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of tokens.
|
||||||
|
"""
|
||||||
|
if lang and self.lang2id and lang not in self.lang2id:
|
||||||
|
logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.")
|
||||||
|
if bypass_tokenizer:
|
||||||
|
text = text.split()
|
||||||
|
elif lang not in self.lang_with_custom_tokenizer:
|
||||||
|
text = self.moses_pipeline(text, lang=lang)
|
||||||
|
# TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
|
||||||
|
if lang == 'ro':
|
||||||
|
text = romanian_preprocessing(text)
|
||||||
|
text = self.moses_tokenize(text, lang=lang)
|
||||||
|
elif lang == 'th':
|
||||||
|
text = self.moses_pipeline(text, lang=lang)
|
||||||
|
try:
|
||||||
|
if 'pythainlp' not in sys.modules:
|
||||||
|
from pythainlp.tokenize import word_tokenize as th_word_tokenize
|
||||||
|
else:
|
||||||
|
th_word_tokenize = sys.modules['pythainlp'].word_tokenize
|
||||||
|
except (AttributeError, ImportError) as e:
|
||||||
|
logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps")
|
||||||
|
logger.error("1. pip install pythainlp")
|
||||||
|
raise e
|
||||||
|
text = th_word_tokenize(text)
|
||||||
|
elif lang == 'zh':
|
||||||
|
try:
|
||||||
|
if 'jieba' not in sys.modules:
|
||||||
|
import jieba
|
||||||
|
else:
|
||||||
|
jieba = sys.modules['jieba']
|
||||||
|
except (AttributeError, ImportError) as e:
|
||||||
|
logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
|
||||||
|
logger.error("1. pip install jieba")
|
||||||
|
raise e
|
||||||
|
text = ' '.join(jieba.cut(text))
|
||||||
|
text = self.moses_pipeline(text, lang=lang)
|
||||||
|
text = text.split()
|
||||||
|
elif lang == 'ja':
|
||||||
|
text = self.moses_pipeline(text, lang=lang)
|
||||||
|
text = self.ja_tokenize(text)
|
||||||
else:
|
else:
|
||||||
# Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
|
raise ValueError('It should not reach here')
|
||||||
text = self.nlp(text_standardize(self.fix_text(text)))
|
|
||||||
for token in text:
|
if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
|
||||||
split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
|
text = lowercase_and_remove_accent(text)
|
||||||
|
|
||||||
|
split_tokens = []
|
||||||
|
for token in text:
|
||||||
|
if token:
|
||||||
|
split_tokens.extend([t for t in self.bpe(token).split(' ')])
|
||||||
|
|
||||||
return split_tokens
|
return split_tokens
|
||||||
|
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
@@ -224,15 +759,15 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Adds special tokens to a sequence for sequence classification tasks.
|
||||||
An XLM sequence has the following format: [CLS] X [SEP]
|
An XLM sequence has the following format: [CLS] X [SEP]
|
||||||
"""
|
"""
|
||||||
return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
|
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
||||||
|
|
||||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
Adds special tokens to a sequence pair for sequence classification tasks.
|
||||||
An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
||||||
"""
|
"""
|
||||||
sep = [self._convert_token_to_id(self.sep_token)]
|
sep = [self.sep_token_id]
|
||||||
cls = [self._convert_token_to_id(self.cls_token)]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
|
||||||
def __init__(self, vocab_file, max_len=None,
|
def __init__(self, vocab_file,
|
||||||
do_lower_case=False, remove_space=True, keep_accents=False,
|
do_lower_case=False, remove_space=True, keep_accents=False,
|
||||||
bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
|
bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
|
||||||
pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
|
pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
|
||||||
@@ -186,8 +186,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
Adds special tokens to a sequence pair for sequence classification tasks.
|
||||||
An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
|
An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
|
||||||
"""
|
"""
|
||||||
sep = [self._convert_token_to_id(self.sep_token)]
|
sep = [self.sep_token_id]
|
||||||
cls = [self._convert_token_to_id(self.cls_token)]
|
cls = [self.cls_token_id]
|
||||||
return token_ids + sep + cls
|
return token_ids + sep + cls
|
||||||
|
|
||||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||||
@@ -195,8 +195,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Adds special tokens to a sequence for sequence classification tasks.
|
||||||
An XLNet sequence has the following format: X [SEP][CLS]
|
An XLNet sequence has the following format: X [SEP][CLS]
|
||||||
"""
|
"""
|
||||||
sep = [self._convert_token_to_id(self.sep_token)]
|
sep = [self.sep_token_id]
|
||||||
cls = [self._convert_token_to_id(self.cls_token)]
|
cls = [self.cls_token_id]
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
|
|||||||
@@ -9,4 +9,6 @@ requests
|
|||||||
# For OpenAI GPT
|
# For OpenAI GPT
|
||||||
regex
|
regex
|
||||||
# For XLNet
|
# For XLNet
|
||||||
sentencepiece
|
sentencepiece
|
||||||
|
# For XLM
|
||||||
|
sacremoses
|
||||||
5
setup.py
5
setup.py
@@ -38,7 +38,7 @@ from setuptools import find_packages, setup
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="pytorch_transformers",
|
name="pytorch_transformers",
|
||||||
version="1.1.0",
|
version="1.2.0",
|
||||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
|
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
|
||||||
author_email="thomas@huggingface.co",
|
author_email="thomas@huggingface.co",
|
||||||
description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
|
description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
|
||||||
@@ -55,7 +55,8 @@ setup(
|
|||||||
'requests',
|
'requests',
|
||||||
'tqdm',
|
'tqdm',
|
||||||
'regex',
|
'regex',
|
||||||
'sentencepiece'],
|
'sentencepiece',
|
||||||
|
'sacremoses'],
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
"pytorch_transformers=pytorch_transformers.__main__:main",
|
"pytorch_transformers=pytorch_transformers.__main__:main",
|
||||||
|
|||||||
Reference in New Issue
Block a user