Merge remote-tracking branch 'upstream/master'
This commit is contained in:
@@ -9,7 +9,7 @@ jobs:
|
|||||||
- run: sudo pip install --progress-bar off .
|
- run: sudo pip install --progress-bar off .
|
||||||
- run: sudo pip install pytest ftfy spacy
|
- run: sudo pip install pytest ftfy spacy
|
||||||
- run: sudo python -m spacy download en
|
- run: sudo python -m spacy download en
|
||||||
- run: python -m pytest -sv tests/
|
- run: python -m pytest -sv tests/ --runslow
|
||||||
build_py2:
|
build_py2:
|
||||||
working_directory: ~/pytorch-pretrained-BERT
|
working_directory: ~/pytorch-pretrained-BERT
|
||||||
docker:
|
docker:
|
||||||
@@ -20,7 +20,7 @@ jobs:
|
|||||||
- run: sudo pip install pytest spacy
|
- run: sudo pip install pytest spacy
|
||||||
- run: sudo pip install ftfy==4.4.3
|
- run: sudo pip install ftfy==4.4.3
|
||||||
- run: sudo python -m spacy download en
|
- run: sudo python -m spacy download en
|
||||||
- run: python -m pytest -sv tests/
|
- run: python -m pytest -sv tests/ --runslow
|
||||||
workflows:
|
workflows:
|
||||||
version: 2
|
version: 2
|
||||||
build_and_test:
|
build_and_test:
|
||||||
|
|||||||
17
.github/stale.yml
vendored
Normal file
17
.github/stale.yml
vendored
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# Number of days of inactivity before an issue becomes stale
|
||||||
|
daysUntilStale: 60
|
||||||
|
# Number of days of inactivity before a stale issue is closed
|
||||||
|
daysUntilClose: 7
|
||||||
|
# Issues with these labels will never be considered stale
|
||||||
|
exemptLabels:
|
||||||
|
- pinned
|
||||||
|
- security
|
||||||
|
# Label to use when marking an issue as stale
|
||||||
|
staleLabel: wontfix
|
||||||
|
# Comment to post when marking an issue as stale. Set to `false` to disable
|
||||||
|
markComment: >
|
||||||
|
This issue has been automatically marked as stale because it has not had
|
||||||
|
recent activity. It will be closed if no further activity occurs. Thank you
|
||||||
|
for your contributions.
|
||||||
|
# Comment to post when closing a stale issue. Set to `false` to disable
|
||||||
|
closeComment: false
|
||||||
139
README.md
139
README.md
@@ -131,6 +131,7 @@ This package comprises the following classes that can be imported in Python and
|
|||||||
- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files):
|
- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files):
|
||||||
- `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
|
- `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
|
||||||
- `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
|
- `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
|
||||||
|
- `GPT2Config` - Configuration class to store the configuration of a `GPT2Model` with utilities to read and write from JSON configuration files.
|
||||||
- `TransfoXLConfig` - Configuration class to store the configuration of a `TransfoXLModel` with utilities to read and write from JSON configuration files.
|
- `TransfoXLConfig` - Configuration class to store the configuration of a `TransfoXLModel` with utilities to read and write from JSON configuration files.
|
||||||
|
|
||||||
The repository further comprises:
|
The repository further comprises:
|
||||||
@@ -461,10 +462,12 @@ Here is a detailed documentation of the classes in the package and how to use th
|
|||||||
|
|
||||||
| Sub-section | Description |
|
| Sub-section | Description |
|
||||||
|-|-|
|
|-|-|
|
||||||
| [Loading Google AI's/OpenAI's pre-trained weights](#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump) | How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance |
|
| [Loading pre-trained weights](#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump) | How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance |
|
||||||
| [PyTorch models](#PyTorch-models) | API of the BERT, GPT, GPT-2 and Transformer-XL PyTorch model classes |
|
| [Serialization best-practices](#serialization-best-practices) | How to save and reload a fine-tuned model |
|
||||||
|
| [Configurations](#configurations) | API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL |
|
||||||
|
| [Models](#models) | API of the PyTorch model classes for BERT, GPT, GPT-2 and Transformer-XL |
|
||||||
| [Tokenizers](#tokenizers) | API of the tokenizers class for BERT, GPT, GPT-2 and Transformer-XL|
|
| [Tokenizers](#tokenizers) | API of the tokenizers class for BERT, GPT, GPT-2 and Transformer-XL|
|
||||||
| [Optimizers](#optimizerss) | API of the optimizers |
|
| [Optimizers](#optimizers) | API of the optimizers |
|
||||||
|
|
||||||
### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
|
### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
|
||||||
|
|
||||||
@@ -524,7 +527,101 @@ model = GPT2Model.from_pretrained('gpt2')
|
|||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### PyTorch models
|
### Serialization best-practices
|
||||||
|
|
||||||
|
This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
|
||||||
|
There are three types of files you need to save to be able to reload a fine-tuned model:
|
||||||
|
|
||||||
|
- the model itself, which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices),
|
||||||
|
- the configuration file of the model which is saved as a JSON file, and
|
||||||
|
- the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
|
||||||
|
|
||||||
|
Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
|
||||||
|
|
||||||
|
output_dir = "./models/"
|
||||||
|
|
||||||
|
# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
|
||||||
|
|
||||||
|
# If we have a distributed model, save only the encapsulated model
|
||||||
|
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
|
||||||
|
model_to_save = model.module if hasattr(model, 'module') else model
|
||||||
|
|
||||||
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
|
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
|
||||||
|
output_config_file = os.path.join(output_dir, CONFIG_NAME)
|
||||||
|
|
||||||
|
torch.save(model_to_save.state_dict(), output_model_file)
|
||||||
|
model_to_save.config.to_json_file(output_config_file)
|
||||||
|
tokenizer.save_vocabulary(output_dir)
|
||||||
|
|
||||||
|
# Step 2: Re-load the saved model and vocabulary
|
||||||
|
|
||||||
|
# Example for a Bert model
|
||||||
|
model = BertForQuestionAnswering.from_pretrained(output_dir)
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case) # Add specific options if needed
|
||||||
|
# Example for a GPT model
|
||||||
|
model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
|
||||||
|
tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
|
||||||
|
```
|
||||||
|
|
||||||
|
Here is another way you can save and reload the model if you want to use specific paths for each type of file:
|
||||||
|
|
||||||
|
```python
|
||||||
|
output_model_file = "./models/my_own_model_file.bin"
|
||||||
|
output_config_file = "./models/my_own_config_file.bin"
|
||||||
|
output_vocab_file = "./models/my_own_vocab_file.bin"
|
||||||
|
|
||||||
|
# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
|
||||||
|
|
||||||
|
# If we have a distributed model, save only the encapsulated model
|
||||||
|
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
|
||||||
|
model_to_save = model.module if hasattr(model, 'module') else model
|
||||||
|
|
||||||
|
torch.save(model_to_save.state_dict(), output_model_file)
|
||||||
|
model_to_save.config.to_json_file(output_config_file)
|
||||||
|
tokenizer.save_vocabulary(output_vocab_file)
|
||||||
|
|
||||||
|
# Step 2: Re-load the saved model and vocabulary
|
||||||
|
|
||||||
|
# We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
|
||||||
|
# Here is how to do it in this situation:
|
||||||
|
|
||||||
|
# Example for a Bert model
|
||||||
|
config = BertConfig.from_json_file(output_config_file)
|
||||||
|
model = BertForQuestionAnswering(config)
|
||||||
|
state_dict = torch.load(output_model_file)
|
||||||
|
model.load_state_dict(state_dict)
|
||||||
|
tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
|
||||||
|
|
||||||
|
# Example for a GPT model
|
||||||
|
config = OpenAIGPTConfig.from_json_file(output_config_file)
|
||||||
|
model = OpenAIGPTDoubleHeadsModel(config)
|
||||||
|
state_dict = torch.load(output_model_file)
|
||||||
|
model.load_state_dict(state_dict)
|
||||||
|
tokenizer = OpenAIGPTTokenizer(output_vocab_file)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configurations
|
||||||
|
|
||||||
|
Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and built from configuration classes which contain the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are:
|
||||||
|
|
||||||
|
- `BertConfig` for `BertModel` and BERT classes instances.
|
||||||
|
- `OpenAIGPTConfig` for `OpenAIGPTModel` and OpenAI GPT classes instances.
|
||||||
|
- `GPT2Config` for `GPT2Model` and OpenAI GPT-2 classes instances.
|
||||||
|
- `TransfoXLConfig` for `TransfoXLModel` and Transformer-XL classes instances.
|
||||||
|
|
||||||
|
These configuration classes contain a few utilities to load and save configurations:
|
||||||
|
|
||||||
|
- `from_dict(cls, json_object)`: A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class.
|
||||||
|
- `from_json_file(cls, json_file)`: A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class.
|
||||||
|
- `to_dict()`: Serializes an instance to a Python dictionary. Returns a dictionary.
|
||||||
|
- `to_json_string()`: Serializes an instance to a JSON string. Returns a string.
|
||||||
|
- `to_json_file(json_file_path)`: Save an instance to a json file.
|
||||||
|
|
||||||
|
### Models
|
||||||
|
|
||||||
#### 1. `BertModel`
|
#### 1. `BertModel`
|
||||||
|
|
||||||
@@ -796,8 +893,7 @@ This model *outputs*:
|
|||||||
- `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
|
- `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
|
||||||
- `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
|
- `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
|
||||||
|
|
||||||
|
### Tokenizers
|
||||||
### Tokenizers:
|
|
||||||
|
|
||||||
#### `BertTokenizer`
|
#### `BertTokenizer`
|
||||||
|
|
||||||
@@ -816,6 +912,7 @@ and three methods:
|
|||||||
- `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
|
- `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
|
||||||
- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
|
- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
|
||||||
- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
|
- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
|
||||||
|
- `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: `vocab_file_path`. The vocabulary can be reloaded with `BertTokenizer.from_pretrained('vocab_file_path')` or `BertTokenizer.from_pretrained('directory_path')`.
|
||||||
|
|
||||||
Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
|
Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
|
||||||
|
|
||||||
@@ -832,11 +929,13 @@ This class has four arguments:
|
|||||||
|
|
||||||
and five methods:
|
and five methods:
|
||||||
|
|
||||||
- `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
|
- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing BPE tokenization.
|
||||||
- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
|
- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
|
||||||
- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
|
- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
|
||||||
- `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments)
|
- `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments)
|
||||||
|
- `encode(text)`: convert a `str` in a list of `int` tokens by performing BPE encoding.
|
||||||
- `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
|
- `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
|
||||||
|
- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
|
||||||
|
|
||||||
Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
|
Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
|
||||||
|
|
||||||
@@ -844,6 +943,8 @@ Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch
|
|||||||
|
|
||||||
`TransfoXLTokenizer` performs word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by token frequency (for adaptive softmax). See the adaptive softmax paper ([Efficient softmax approximation for GPUs](http://arxiv.org/abs/1609.04309)) for more details.
|
`TransfoXLTokenizer` performs word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by token frequency (for adaptive softmax). See the adaptive softmax paper ([Efficient softmax approximation for GPUs](http://arxiv.org/abs/1609.04309)) for more details.
|
||||||
|
|
||||||
|
The API is similar to the API of `BertTokenizer` (see above).
|
||||||
|
|
||||||
Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
|
Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
|
||||||
|
|
||||||
#### `GPT2Tokenizer`
|
#### `GPT2Tokenizer`
|
||||||
@@ -858,13 +959,17 @@ This class has three arguments:
|
|||||||
|
|
||||||
and two methods:
|
and two methods:
|
||||||
|
|
||||||
|
- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing byte-level BPE.
|
||||||
|
- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
|
||||||
|
- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
|
||||||
|
- `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments)
|
||||||
- `encode(text)`: convert a `str` in a list of `int` tokens by performing byte-level BPE.
|
- `encode(text)`: convert a `str` in a list of `int` tokens by performing byte-level BPE.
|
||||||
- `decode(tokens)`: convert back a list of `int` tokens in a `str`.
|
- `decode(tokens)`: convert back a list of `int` tokens in a `str`.
|
||||||
|
- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `GPT2Tokenizer.from_pretrained('directory_path')`.
|
||||||
|
|
||||||
Please refer to [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
|
Please refer to [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
|
||||||
|
|
||||||
|
### Optimizers
|
||||||
### Optimizers:
|
|
||||||
|
|
||||||
#### `BertAdam`
|
#### `BertAdam`
|
||||||
|
|
||||||
@@ -1174,18 +1279,20 @@ To get these results we used a combination of:
|
|||||||
|
|
||||||
Here is the full list of hyper-parameters for this run:
|
Here is the full list of hyper-parameters for this run:
|
||||||
```bash
|
```bash
|
||||||
|
export SQUAD_DIR=/path/to/SQUAD
|
||||||
|
|
||||||
python ./run_squad.py \
|
python ./run_squad.py \
|
||||||
--bert_model bert-large-uncased \
|
--bert_model bert-large-uncased \
|
||||||
--do_train \
|
--do_train \
|
||||||
--do_predict \
|
--do_predict \
|
||||||
--do_lower_case \
|
--do_lower_case \
|
||||||
--train_file $SQUAD_TRAIN \
|
--train_file $SQUAD_DIR/train-v1.1.json \
|
||||||
--predict_file $SQUAD_EVAL \
|
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
||||||
--learning_rate 3e-5 \
|
--learning_rate 3e-5 \
|
||||||
--num_train_epochs 2 \
|
--num_train_epochs 2 \
|
||||||
--max_seq_length 384 \
|
--max_seq_length 384 \
|
||||||
--doc_stride 128 \
|
--doc_stride 128 \
|
||||||
--output_dir $OUTPUT_DIR \
|
--output_dir /tmp/debug_squad/ \
|
||||||
--train_batch_size 24 \
|
--train_batch_size 24 \
|
||||||
--gradient_accumulation_steps 2
|
--gradient_accumulation_steps 2
|
||||||
```
|
```
|
||||||
@@ -1194,18 +1301,20 @@ If you have a recent GPU (starting from NVIDIA Volta series), you should try **1
|
|||||||
|
|
||||||
Here is an example of hyper-parameters for a FP16 run we tried:
|
Here is an example of hyper-parameters for a FP16 run we tried:
|
||||||
```bash
|
```bash
|
||||||
|
export SQUAD_DIR=/path/to/SQUAD
|
||||||
|
|
||||||
python ./run_squad.py \
|
python ./run_squad.py \
|
||||||
--bert_model bert-large-uncased \
|
--bert_model bert-large-uncased \
|
||||||
--do_train \
|
--do_train \
|
||||||
--do_predict \
|
--do_predict \
|
||||||
--do_lower_case \
|
--do_lower_case \
|
||||||
--train_file $SQUAD_TRAIN \
|
--train_file $SQUAD_DIR/train-v1.1.json \
|
||||||
--predict_file $SQUAD_EVAL \
|
--predict_file $SQUAD_DIR/dev-v1.1.json \
|
||||||
--learning_rate 3e-5 \
|
--learning_rate 3e-5 \
|
||||||
--num_train_epochs 2 \
|
--num_train_epochs 2 \
|
||||||
--max_seq_length 384 \
|
--max_seq_length 384 \
|
||||||
--doc_stride 128 \
|
--doc_stride 128 \
|
||||||
--output_dir $OUTPUT_DIR \
|
--output_dir /tmp/debug_squad/ \
|
||||||
--train_batch_size 24 \
|
--train_batch_size 24 \
|
||||||
--fp16 \
|
--fp16 \
|
||||||
--loss_scale 128
|
--loss_scale 128
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ python3 simple_lm_finetuning.py
|
|||||||
--bert_model bert-base-uncased
|
--bert_model bert-base-uncased
|
||||||
--do_lower_case
|
--do_lower_case
|
||||||
--output_dir finetuned_lm/
|
--output_dir finetuned_lm/
|
||||||
|
--do_train
|
||||||
```
|
```
|
||||||
|
|
||||||
### Pregenerating training data
|
### Pregenerating training data
|
||||||
|
|||||||
@@ -123,9 +123,8 @@ def main():
|
|||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
parser.add_argument('--pregenerated_data', type=Path, required=True)
|
parser.add_argument('--pregenerated_data', type=Path, required=True)
|
||||||
parser.add_argument('--output_dir', type=Path, required=True)
|
parser.add_argument('--output_dir', type=Path, required=True)
|
||||||
parser.add_argument("--bert_model", type=str, required=True,
|
parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, "
|
||||||
choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
|
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
|
||||||
"bert-base-multilingual", "bert-base-chinese"])
|
|
||||||
parser.add_argument("--do_lower_case", action="store_true")
|
parser.add_argument("--do_lower_case", action="store_true")
|
||||||
parser.add_argument("--reduce_memory", action="store_true",
|
parser.add_argument("--reduce_memory", action="store_true",
|
||||||
help="Store training data as on-disc memmaps to massively reduce memory usage")
|
help="Store training data as on-disc memmaps to massively reduce memory usage")
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from tqdm import tqdm, trange
|
|||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
import shelve
|
import shelve
|
||||||
|
|
||||||
from random import random, randint, shuffle, choice, sample
|
from random import random, randrange, randint, shuffle, choice, sample
|
||||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import json
|
import json
|
||||||
@@ -30,6 +30,8 @@ class DocumentDatabase:
|
|||||||
self.reduce_memory = reduce_memory
|
self.reduce_memory = reduce_memory
|
||||||
|
|
||||||
def add_document(self, document):
|
def add_document(self, document):
|
||||||
|
if not document:
|
||||||
|
return
|
||||||
if self.reduce_memory:
|
if self.reduce_memory:
|
||||||
current_idx = len(self.doc_lengths)
|
current_idx = len(self.doc_lengths)
|
||||||
self.document_shelf[str(current_idx)] = document
|
self.document_shelf[str(current_idx)] = document
|
||||||
@@ -49,11 +51,11 @@ class DocumentDatabase:
|
|||||||
self._precalculate_doc_weights()
|
self._precalculate_doc_weights()
|
||||||
rand_start = self.doc_cumsum[current_idx]
|
rand_start = self.doc_cumsum[current_idx]
|
||||||
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
|
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
|
||||||
sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
|
sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
|
||||||
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
|
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
|
||||||
else:
|
else:
|
||||||
# If we don't use sentence weighting, then every doc has an equal chance to be chosen
|
# If we don't use sentence weighting, then every doc has an equal chance to be chosen
|
||||||
sampled_doc_index = current_idx + randint(1, len(self.doc_lengths)-1)
|
sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
|
||||||
assert sampled_doc_index != current_idx
|
assert sampled_doc_index != current_idx
|
||||||
if self.reduce_memory:
|
if self.reduce_memory:
|
||||||
return self.document_shelf[str(sampled_doc_index)]
|
return self.document_shelf[str(sampled_doc_index)]
|
||||||
@@ -170,7 +172,7 @@ def create_instances_from_document(
|
|||||||
# (first) sentence.
|
# (first) sentence.
|
||||||
a_end = 1
|
a_end = 1
|
||||||
if len(current_chunk) >= 2:
|
if len(current_chunk) >= 2:
|
||||||
a_end = randint(1, len(current_chunk) - 1)
|
a_end = randrange(1, len(current_chunk))
|
||||||
|
|
||||||
tokens_a = []
|
tokens_a = []
|
||||||
for j in range(a_end):
|
for j in range(a_end):
|
||||||
@@ -186,7 +188,7 @@ def create_instances_from_document(
|
|||||||
# Sample a random document, with longer docs being sampled more frequently
|
# Sample a random document, with longer docs being sampled more frequently
|
||||||
random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)
|
random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)
|
||||||
|
|
||||||
random_start = randint(0, len(random_document) - 1)
|
random_start = randrange(0, len(random_document))
|
||||||
for j in range(random_start, len(random_document)):
|
for j in range(random_start, len(random_document)):
|
||||||
tokens_b.extend(random_document[j])
|
tokens_b.extend(random_document[j])
|
||||||
if len(tokens_b) >= target_b_length:
|
if len(tokens_b) >= target_b_length:
|
||||||
@@ -264,6 +266,14 @@ def main():
|
|||||||
else:
|
else:
|
||||||
tokens = tokenizer.tokenize(line)
|
tokens = tokenizer.tokenize(line)
|
||||||
doc.append(tokens)
|
doc.append(tokens)
|
||||||
|
if doc:
|
||||||
|
docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added
|
||||||
|
if len(docs) <= 1:
|
||||||
|
exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
|
||||||
|
"ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
|
||||||
|
"indicate breaks between documents in your input file. If your dataset does not contain multiple "
|
||||||
|
"documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
|
||||||
|
"sections or paragraphs.")
|
||||||
|
|
||||||
args.output_dir.mkdir(exist_ok=True)
|
args.output_dir.mkdir(exist_ok=True)
|
||||||
for epoch in trange(args.epochs_to_generate, desc="Epoch"):
|
for epoch in trange(args.epochs_to_generate, desc="Epoch"):
|
||||||
|
|||||||
@@ -35,14 +35,11 @@ from torch.nn import CrossEntropyLoss, MSELoss
|
|||||||
from scipy.stats import pearsonr, spearmanr
|
from scipy.stats import pearsonr, spearmanr
|
||||||
from sklearn.metrics import matthews_corrcoef, f1_score
|
from sklearn.metrics import matthews_corrcoef, f1_score
|
||||||
|
|
||||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
|
||||||
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
|
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
|
||||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
|
||||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
|
||||||
level = logging.INFO)
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -95,7 +92,7 @@ class DataProcessor(object):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def _read_tsv(cls, input_file, quotechar=None):
|
def _read_tsv(cls, input_file, quotechar=None):
|
||||||
"""Reads a tab separated value file."""
|
"""Reads a tab separated value file."""
|
||||||
with open(input_file, "r") as f:
|
with open(input_file, "r", encoding="utf-8") as f:
|
||||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
||||||
lines = []
|
lines = []
|
||||||
for line in reader:
|
for line in reader:
|
||||||
@@ -697,6 +694,11 @@ def main():
|
|||||||
n_gpu = 1
|
n_gpu = 1
|
||||||
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||||
torch.distributed.init_process_group(backend='nccl')
|
torch.distributed.init_process_group(backend='nccl')
|
||||||
|
|
||||||
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
|
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||||
|
|
||||||
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
||||||
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
||||||
|
|
||||||
@@ -857,18 +859,21 @@ def main():
|
|||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
# Save a trained model and the associated configuration
|
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||||
|
# Save a trained model, configuration and tokenizer
|
||||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
|
||||||
torch.save(model_to_save.state_dict(), output_model_file)
|
|
||||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
|
||||||
with open(output_config_file, 'w') as f:
|
|
||||||
f.write(model_to_save.config.to_json_string())
|
|
||||||
|
|
||||||
# Load a trained model and config that you have fine-tuned
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
config = BertConfig(output_config_file)
|
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||||
model = BertForSequenceClassification(config, num_labels=num_labels)
|
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||||
model.load_state_dict(torch.load(output_model_file))
|
|
||||||
|
torch.save(model_to_save.state_dict(), output_model_file)
|
||||||
|
model_to_save.config.to_json_file(output_config_file)
|
||||||
|
tokenizer.save_vocabulary(args.output_dir)
|
||||||
|
|
||||||
|
# Load a trained model and vocabulary that you have fine-tuned
|
||||||
|
model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||||
else:
|
else:
|
||||||
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
|
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
|
||||||
model.to(device)
|
model.to(device)
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ def run_model():
|
|||||||
parser.add_argument("--nsamples", type=int, default=1)
|
parser.add_argument("--nsamples", type=int, default=1)
|
||||||
parser.add_argument("--batch_size", type=int, default=-1)
|
parser.add_argument("--batch_size", type=int, default=-1)
|
||||||
parser.add_argument("--length", type=int, default=-1)
|
parser.add_argument("--length", type=int, default=-1)
|
||||||
parser.add_argument("--temperature", type=int, default=1)
|
parser.add_argument("--temperature", type=float, default=1.0)
|
||||||
parser.add_argument("--top_k", type=int, default=0)
|
parser.add_argument("--top_k", type=int, default=0)
|
||||||
parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
|
parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@@ -83,7 +83,8 @@ def run_model():
|
|||||||
elif args.length > model.config.n_ctx:
|
elif args.length > model.config.n_ctx:
|
||||||
raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
|
raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
|
||||||
|
|
||||||
while not args.unconditional:
|
while True:
|
||||||
|
context_tokens = []
|
||||||
if not args.unconditional:
|
if not args.unconditional:
|
||||||
raw_text = input("Model prompt >>> ")
|
raw_text = input("Model prompt >>> ")
|
||||||
while not raw_text:
|
while not raw_text:
|
||||||
@@ -94,8 +95,8 @@ def run_model():
|
|||||||
for _ in range(args.nsamples // args.batch_size):
|
for _ in range(args.nsamples // args.batch_size):
|
||||||
out = sample_sequence(
|
out = sample_sequence(
|
||||||
model=model, length=args.length,
|
model=model, length=args.length,
|
||||||
context=context_tokens if not args.unconditional else None,
|
context=context_tokens,
|
||||||
start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
|
start_token=None,
|
||||||
batch_size=args.batch_size,
|
batch_size=args.batch_size,
|
||||||
temperature=args.temperature, top_k=args.top_k, device=device
|
temperature=args.temperature, top_k=args.top_k, device=device
|
||||||
)
|
)
|
||||||
@@ -106,7 +107,27 @@ def run_model():
|
|||||||
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
|
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
|
||||||
print(text)
|
print(text)
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
if args.unconditional:
|
||||||
|
generated = 0
|
||||||
|
for _ in range(args.nsamples // args.batch_size):
|
||||||
|
out = sample_sequence(
|
||||||
|
model=model, length=args.length,
|
||||||
|
context=None,
|
||||||
|
start_token=enc.encoder['<|endoftext|>'],
|
||||||
|
batch_size=args.batch_size,
|
||||||
|
temperature=args.temperature, top_k=args.top_k, device=device
|
||||||
|
)
|
||||||
|
out = out[:,1:].tolist()
|
||||||
|
for i in range(args.batch_size):
|
||||||
|
generated += 1
|
||||||
|
text = enc.decode(out[i])
|
||||||
|
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
|
||||||
|
print(text)
|
||||||
|
print("=" * 80)
|
||||||
|
if args.unconditional:
|
||||||
|
break
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
run_model()
|
run_model()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,8 @@ import torch
|
|||||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||||
TensorDataset)
|
TensorDataset)
|
||||||
|
|
||||||
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
|
from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
||||||
|
OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
|
||||||
|
|
||||||
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
||||||
|
|
||||||
@@ -218,15 +219,20 @@ def main():
|
|||||||
|
|
||||||
# Save a trained model
|
# Save a trained model
|
||||||
if args.do_train:
|
if args.do_train:
|
||||||
|
# Save a trained model, configuration and tokenizer
|
||||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||||
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
|
|
||||||
config = model.config
|
|
||||||
torch.save(model_to_save.state_dict(), output_model_file)
|
|
||||||
|
|
||||||
# Load a trained model that you have fine-tuned
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
model_state_dict = torch.load(output_model_file)
|
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||||
model = OpenAIGPTDoubleHeadsModel(config)
|
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||||
model.load_state_dict(model_state_dict)
|
|
||||||
|
torch.save(model_to_save.state_dict(), output_model_file)
|
||||||
|
model_to_save.config.to_json_file(output_config_file)
|
||||||
|
tokenizer.save_vocabulary(args.output_dir)
|
||||||
|
|
||||||
|
# Load a trained model and vocabulary that you have fine-tuned
|
||||||
|
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
|
||||||
|
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
|
||||||
model.to(device)
|
model.to(device)
|
||||||
|
|
||||||
if args.do_eval:
|
if args.do_eval:
|
||||||
|
|||||||
@@ -34,8 +34,8 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
|||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
|
||||||
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
|
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig
|
||||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||||
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
|
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
|
||||||
BertTokenizer,
|
BertTokenizer,
|
||||||
@@ -46,9 +46,6 @@ if sys.version_info[0] == 2:
|
|||||||
else:
|
else:
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
|
||||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
|
||||||
level = logging.INFO)
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -837,7 +834,17 @@ def main():
|
|||||||
parser.add_argument('--null_score_diff_threshold',
|
parser.add_argument('--null_score_diff_threshold',
|
||||||
type=float, default=0.0,
|
type=float, default=0.0,
|
||||||
help="If null_score - best_non_null is greater than the threshold predict null.")
|
help="If null_score - best_non_null is greater than the threshold predict null.")
|
||||||
|
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||||
|
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
print(args)
|
||||||
|
|
||||||
|
if args.server_ip and args.server_port:
|
||||||
|
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||||
|
import ptvsd
|
||||||
|
print("Waiting for debugger attach")
|
||||||
|
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||||
|
ptvsd.wait_for_attach()
|
||||||
|
|
||||||
if args.local_rank == -1 or args.no_cuda:
|
if args.local_rank == -1 or args.no_cuda:
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||||
@@ -848,6 +855,11 @@ def main():
|
|||||||
n_gpu = 1
|
n_gpu = 1
|
||||||
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||||
torch.distributed.init_process_group(backend='nccl')
|
torch.distributed.init_process_group(backend='nccl')
|
||||||
|
|
||||||
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
|
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||||
|
|
||||||
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
||||||
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
||||||
|
|
||||||
@@ -983,7 +995,7 @@ def main():
|
|||||||
|
|
||||||
model.train()
|
model.train()
|
||||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||||
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
|
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
|
||||||
if n_gpu == 1:
|
if n_gpu == 1:
|
||||||
batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
|
batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
|
||||||
input_ids, input_mask, segment_ids, start_positions, end_positions = batch
|
input_ids, input_mask, segment_ids, start_positions, end_positions = batch
|
||||||
@@ -1008,19 +1020,21 @@ def main():
|
|||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
if args.do_train:
|
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||||
# Save a trained model and the associated configuration
|
# Save a trained model, configuration and tokenizer
|
||||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
|
||||||
torch.save(model_to_save.state_dict(), output_model_file)
|
|
||||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
|
||||||
with open(output_config_file, 'w') as f:
|
|
||||||
f.write(model_to_save.config.to_json_string())
|
|
||||||
|
|
||||||
# Load a trained model and config that you have fine-tuned
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
config = BertConfig(output_config_file)
|
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||||
model = BertForQuestionAnswering(config)
|
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||||
model.load_state_dict(torch.load(output_model_file))
|
|
||||||
|
torch.save(model_to_save.state_dict(), output_model_file)
|
||||||
|
model_to_save.config.to_json_file(output_config_file)
|
||||||
|
tokenizer.save_vocabulary(args.output_dir)
|
||||||
|
|
||||||
|
# Load a trained model and vocabulary that you have fine-tuned
|
||||||
|
model = BertForQuestionAnswering.from_pretrained(args.output_dir)
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||||
else:
|
else:
|
||||||
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
|
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
|
||||||
|
|
||||||
@@ -1054,7 +1068,7 @@ def main():
|
|||||||
model.eval()
|
model.eval()
|
||||||
all_results = []
|
all_results = []
|
||||||
logger.info("Start evaluating")
|
logger.info("Start evaluating")
|
||||||
for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
|
for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
|
||||||
if len(all_results) % 1000 == 0:
|
if len(all_results) % 1000 == 0:
|
||||||
logger.info("Processing example: %d" % (len(all_results)))
|
logger.info("Processing example: %d" % (len(all_results)))
|
||||||
input_ids = input_ids.to(device)
|
input_ids = input_ids.to(device)
|
||||||
|
|||||||
@@ -32,8 +32,8 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
|||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
|
||||||
from pytorch_pretrained_bert.modeling import (BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME)
|
from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig
|
||||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||||
|
|
||||||
@@ -473,18 +473,20 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if args.do_train:
|
if args.do_train:
|
||||||
# Save a trained model and the associated configuration
|
# Save a trained model, configuration and tokenizer
|
||||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
|
||||||
torch.save(model_to_save.state_dict(), output_model_file)
|
|
||||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
|
||||||
with open(output_config_file, 'w') as f:
|
|
||||||
f.write(model_to_save.config.to_json_string())
|
|
||||||
|
|
||||||
# Load a trained model and config that you have fine-tuned
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
config = BertConfig(output_config_file)
|
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||||
model = BertForMultipleChoice(config, num_choices=4)
|
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||||
model.load_state_dict(torch.load(output_model_file))
|
|
||||||
|
torch.save(model_to_save.state_dict(), output_model_file)
|
||||||
|
model_to_save.config.to_json_file(output_config_file)
|
||||||
|
tokenizer.save_vocabulary(args.output_dir)
|
||||||
|
|
||||||
|
# Load a trained model and vocabulary that you have fine-tuned
|
||||||
|
model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4)
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||||
else:
|
else:
|
||||||
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
|
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
|
||||||
model.to(device)
|
model.to(device)
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import math
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus
|
from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
@@ -80,6 +80,7 @@ def main():
|
|||||||
# The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
|
# The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
|
||||||
# and tokenizing the dataset
|
# and tokenizing the dataset
|
||||||
# The pre-processed corpus is a convertion (using the conversion script )
|
# The pre-processed corpus is a convertion (using the conversion script )
|
||||||
|
tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name)
|
||||||
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
|
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
|
||||||
ntokens = len(corpus.vocab)
|
ntokens = len(corpus.vocab)
|
||||||
|
|
||||||
|
|||||||
@@ -21,4 +21,4 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model,
|
|||||||
from .optimization import BertAdam
|
from .optimization import BertAdam
|
||||||
from .optimization_openai import OpenAIAdam
|
from .optimization_openai import OpenAIAdam
|
||||||
|
|
||||||
from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path
|
from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME
|
||||||
|
|||||||
@@ -5,11 +5,13 @@ Copyright by the AllenNLP authors.
|
|||||||
"""
|
"""
|
||||||
from __future__ import (absolute_import, division, print_function, unicode_literals)
|
from __future__ import (absolute_import, division, print_function, unicode_literals)
|
||||||
|
|
||||||
|
import sys
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import fnmatch
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from hashlib import sha256
|
from hashlib import sha256
|
||||||
import sys
|
import sys
|
||||||
@@ -33,6 +35,9 @@ except (AttributeError, ImportError):
|
|||||||
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
|
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
|
||||||
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
|
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
|
||||||
|
|
||||||
|
CONFIG_NAME = "config.json"
|
||||||
|
WEIGHTS_NAME = "pytorch_model.bin"
|
||||||
|
|
||||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
|
||||||
@@ -188,17 +193,30 @@ def get_from_cache(url, cache_dir=None):
|
|||||||
if url.startswith("s3://"):
|
if url.startswith("s3://"):
|
||||||
etag = s3_etag(url)
|
etag = s3_etag(url)
|
||||||
else:
|
else:
|
||||||
|
try:
|
||||||
response = requests.head(url, allow_redirects=True)
|
response = requests.head(url, allow_redirects=True)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
raise IOError("HEAD request failed for url {} with status code {}"
|
etag = None
|
||||||
.format(url, response.status_code))
|
else:
|
||||||
etag = response.headers.get("ETag")
|
etag = response.headers.get("ETag")
|
||||||
|
except EnvironmentError:
|
||||||
|
etag = None
|
||||||
|
|
||||||
|
if sys.version_info[0] == 2 and etag is not None:
|
||||||
|
etag = etag.decode('utf-8')
|
||||||
filename = url_to_filename(url, etag)
|
filename = url_to_filename(url, etag)
|
||||||
|
|
||||||
# get cache path to put the file
|
# get cache path to put the file
|
||||||
cache_path = os.path.join(cache_dir, filename)
|
cache_path = os.path.join(cache_dir, filename)
|
||||||
|
|
||||||
|
# If we don't have a connection (etag is None) and can't identify the file
|
||||||
|
# try to get the last downloaded one
|
||||||
|
if not os.path.exists(cache_path) and etag is None:
|
||||||
|
matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
|
||||||
|
matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
|
||||||
|
if matching_files:
|
||||||
|
cache_path = os.path.join(cache_dir, matching_files[-1])
|
||||||
|
|
||||||
if not os.path.exists(cache_path):
|
if not os.path.exists(cache_path):
|
||||||
# Download to temporary file, then copy to cache dir once finished.
|
# Download to temporary file, then copy to cache dir once finished.
|
||||||
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
||||||
@@ -223,8 +241,11 @@ def get_from_cache(url, cache_dir=None):
|
|||||||
logger.info("creating metadata file for %s", cache_path)
|
logger.info("creating metadata file for %s", cache_path)
|
||||||
meta = {'url': url, 'etag': etag}
|
meta = {'url': url, 'etag': etag}
|
||||||
meta_path = cache_path + '.json'
|
meta_path = cache_path + '.json'
|
||||||
with open(meta_path, 'w', encoding="utf-8") as meta_file:
|
with open(meta_path, 'w') as meta_file:
|
||||||
json.dump(meta, meta_file)
|
output_string = json.dumps(meta)
|
||||||
|
if sys.version_info[0] == 2 and isinstance(output_string, str):
|
||||||
|
output_string = unicode(output_string, 'utf-8') # The beauty of python 2
|
||||||
|
meta_file.write(output_string)
|
||||||
|
|
||||||
logger.info("removing temp file %s", temp_file.name)
|
logger.info("removing temp file %s", temp_file.name)
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ import torch
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
|
|
||||||
from .file_utils import cached_path
|
from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -45,8 +45,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
|
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
|
||||||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
|
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
|
||||||
}
|
}
|
||||||
CONFIG_NAME = 'bert_config.json'
|
BERT_CONFIG_NAME = 'bert_config.json'
|
||||||
WEIGHTS_NAME = 'pytorch_model.bin'
|
|
||||||
TF_WEIGHTS_NAME = 'model.ckpt'
|
TF_WEIGHTS_NAME = 'model.ckpt'
|
||||||
|
|
||||||
def load_tf_weights_in_bert(model, tf_checkpoint_path):
|
def load_tf_weights_in_bert(model, tf_checkpoint_path):
|
||||||
@@ -220,6 +219,11 @@ class BertConfig(object):
|
|||||||
"""Serializes this instance to a JSON string."""
|
"""Serializes this instance to a JSON string."""
|
||||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||||
|
|
||||||
|
def to_json_file(self, json_file_path):
|
||||||
|
""" Save this instance to a json file."""
|
||||||
|
with open(json_file_path, "w", encoding='utf-8') as writer:
|
||||||
|
writer.write(self.to_json_string())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -581,13 +585,16 @@ class BertPreTrainedModel(nn.Module):
|
|||||||
serialization_dir = tempdir
|
serialization_dir = tempdir
|
||||||
# Load config
|
# Load config
|
||||||
config_file = os.path.join(serialization_dir, CONFIG_NAME)
|
config_file = os.path.join(serialization_dir, CONFIG_NAME)
|
||||||
|
if not os.path.exists(config_file):
|
||||||
|
# Backward compatibility with old naming format
|
||||||
|
config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
|
||||||
config = BertConfig.from_json_file(config_file)
|
config = BertConfig.from_json_file(config_file)
|
||||||
logger.info("Model config {}".format(config))
|
logger.info("Model config {}".format(config))
|
||||||
# Instantiate model.
|
# Instantiate model.
|
||||||
model = cls(config, *inputs, **kwargs)
|
model = cls(config, *inputs, **kwargs)
|
||||||
if state_dict is None and not from_tf:
|
if state_dict is None and not from_tf:
|
||||||
weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
|
weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
|
||||||
state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
|
state_dict = torch.load(weights_path, map_location='cpu')
|
||||||
if tempdir:
|
if tempdir:
|
||||||
# Clean up temp dir
|
# Clean up temp dir
|
||||||
shutil.rmtree(tempdir)
|
shutil.rmtree(tempdir)
|
||||||
@@ -930,7 +937,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
Inputs:
|
Inputs:
|
||||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
|
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
|
||||||
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
|
with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
|
||||||
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
|
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
|
||||||
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
|
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
|
||||||
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
|
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ import torch.nn as nn
|
|||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
|
|
||||||
from .file_utils import cached_path
|
from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
|
||||||
from .modeling import BertLayerNorm as LayerNorm
|
from .modeling import BertLayerNorm as LayerNorm
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -42,9 +42,6 @@ logger = logging.getLogger(__name__)
|
|||||||
PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"}
|
PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"}
|
||||||
PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"}
|
PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"}
|
||||||
|
|
||||||
CONFIG_NAME = "config.json"
|
|
||||||
WEIGHTS_NAME = "pytorch_model.bin"
|
|
||||||
|
|
||||||
def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
|
def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
|
||||||
""" Load tf checkpoints in a pytorch model
|
""" Load tf checkpoints in a pytorch model
|
||||||
"""
|
"""
|
||||||
@@ -180,6 +177,11 @@ class GPT2Config(object):
|
|||||||
"""Serializes this instance to a JSON string."""
|
"""Serializes this instance to a JSON string."""
|
||||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||||
|
|
||||||
|
def to_json_file(self, json_file_path):
|
||||||
|
""" Save this instance to a json file."""
|
||||||
|
with open(json_file_path, "w", encoding='utf-8') as writer:
|
||||||
|
writer.write(self.to_json_string())
|
||||||
|
|
||||||
|
|
||||||
class Conv1D(nn.Module):
|
class Conv1D(nn.Module):
|
||||||
def __init__(self, nf, nx):
|
def __init__(self, nf, nx):
|
||||||
@@ -216,7 +218,7 @@ class Attention(nn.Module):
|
|||||||
w = w / math.sqrt(v.size(-1))
|
w = w / math.sqrt(v.size(-1))
|
||||||
nd, ns = w.size(-2), w.size(-1)
|
nd, ns = w.size(-2), w.size(-1)
|
||||||
b = self.bias[:, :, ns-nd:ns, :ns]
|
b = self.bias[:, :, ns-nd:ns, :ns]
|
||||||
w = w * b - 1e10 * (1 - b)
|
w = w * b - 1e4 * (1 - b)
|
||||||
|
|
||||||
w = nn.Softmax(dim=-1)(w)
|
w = nn.Softmax(dim=-1)(w)
|
||||||
return torch.matmul(w, v)
|
return torch.matmul(w, v)
|
||||||
@@ -416,7 +418,7 @@ class GPT2PreTrainedModel(nn.Module):
|
|||||||
# Instantiate model.
|
# Instantiate model.
|
||||||
model = cls(config, *inputs, **kwargs)
|
model = cls(config, *inputs, **kwargs)
|
||||||
if state_dict is None and not from_tf:
|
if state_dict is None and not from_tf:
|
||||||
state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
|
state_dict = torch.load(resolved_archive_file, map_location='cpu')
|
||||||
if from_tf:
|
if from_tf:
|
||||||
# Directly load from a TensorFlow checkpoint (stored as NumPy array)
|
# Directly load from a TensorFlow checkpoint (stored as NumPy array)
|
||||||
return load_tf_weights_in_gpt2(model, resolved_archive_file)
|
return load_tf_weights_in_gpt2(model, resolved_archive_file)
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ import torch.nn as nn
|
|||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
|
|
||||||
from .file_utils import cached_path
|
from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
|
||||||
from .modeling import BertLayerNorm as LayerNorm
|
from .modeling import BertLayerNorm as LayerNorm
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -42,8 +42,6 @@ logger = logging.getLogger(__name__)
|
|||||||
PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
|
PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
|
||||||
PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
|
PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
|
||||||
|
|
||||||
CONFIG_NAME = "config.json"
|
|
||||||
WEIGHTS_NAME = "pytorch_model.bin"
|
|
||||||
|
|
||||||
def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
|
def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
|
||||||
""" Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
|
""" Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
|
||||||
@@ -225,6 +223,11 @@ class OpenAIGPTConfig(object):
|
|||||||
"""Serializes this instance to a JSON string."""
|
"""Serializes this instance to a JSON string."""
|
||||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||||
|
|
||||||
|
def to_json_file(self, json_file_path):
|
||||||
|
""" Save this instance to a json file."""
|
||||||
|
with open(json_file_path, "w", encoding='utf-8') as writer:
|
||||||
|
writer.write(self.to_json_string())
|
||||||
|
|
||||||
|
|
||||||
class Conv1D(nn.Module):
|
class Conv1D(nn.Module):
|
||||||
def __init__(self, nf, rf, nx):
|
def __init__(self, nf, rf, nx):
|
||||||
@@ -473,7 +476,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
|
|||||||
# Instantiate model.
|
# Instantiate model.
|
||||||
model = cls(config, *inputs, **kwargs)
|
model = cls(config, *inputs, **kwargs)
|
||||||
if state_dict is None and not from_tf:
|
if state_dict is None and not from_tf:
|
||||||
state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
|
state_dict = torch.load(resolved_archive_file, map_location='cpu')
|
||||||
if from_tf:
|
if from_tf:
|
||||||
# Directly load from a TensorFlow checkpoint (stored as NumPy array)
|
# Directly load from a TensorFlow checkpoint (stored as NumPy array)
|
||||||
return load_tf_weights_in_openai_gpt(model, resolved_archive_file)
|
return load_tf_weights_in_openai_gpt(model, resolved_archive_file)
|
||||||
@@ -605,14 +608,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
return
|
return
|
||||||
# Update config
|
# Update config
|
||||||
self.config.n_special = num_special_tokens
|
self.config.n_special = num_special_tokens
|
||||||
# # Build new embeddings and initialize
|
# Build new embeddings and initialize all new embeddings (in particular the special tokens)
|
||||||
old_embed = self.tokens_embed
|
old_embed = self.tokens_embed
|
||||||
self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
|
self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
|
||||||
# Initialize all new embeddings (in particular the special tokens)
|
self.tokens_embed.to(old_embed.weight.device)
|
||||||
self.init_weights(self.tokens_embed)
|
self.init_weights(self.tokens_embed)
|
||||||
# Copy word and positional embeddings from the previous weights
|
# Copy word embeddings from the previous weights
|
||||||
self.tokens_embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
|
self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
|
||||||
self.tokens_embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
|
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None):
|
def forward(self, input_ids, position_ids=None, token_type_ids=None):
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
@@ -717,9 +719,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
# Shift so that tokens < n predict n
|
# Shift so that tokens < n predict n
|
||||||
shift_logits = lm_logits[:, :-1].contiguous()
|
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||||
shift_labels = lm_labels[:, 1:].contiguous()
|
shift_labels = lm_labels[..., 1:].contiguous()
|
||||||
|
|
||||||
# Flatten the tokens
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
@@ -809,11 +810,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
||||||
losses = []
|
losses = []
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
shift_logits = lm_logits[:, :-1].contiguous()
|
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||||
shift_labels = lm_labels[:, 1:].contiguous()
|
shift_labels = lm_labels[..., 1:].contiguous()
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
losses.append(loss_fct(shift_logits.view(-1,
|
losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))
|
||||||
shift_logits.size(-1)), shift_labels.view(-1)))
|
|
||||||
if mc_labels is not None:
|
if mc_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss()
|
loss_fct = CrossEntropyLoss()
|
||||||
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ from torch.nn.parameter import Parameter
|
|||||||
|
|
||||||
from .modeling import BertLayerNorm as LayerNorm
|
from .modeling import BertLayerNorm as LayerNorm
|
||||||
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
|
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
|
||||||
from .file_utils import cached_path
|
from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -50,8 +50,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
|
|||||||
PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
|
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
|
||||||
}
|
}
|
||||||
CONFIG_NAME = 'config.json'
|
|
||||||
WEIGHTS_NAME = 'pytorch_model.bin'
|
|
||||||
TF_WEIGHTS_NAME = 'model.ckpt'
|
TF_WEIGHTS_NAME = 'model.ckpt'
|
||||||
|
|
||||||
def build_tf_to_pytorch_map(model, config):
|
def build_tf_to_pytorch_map(model, config):
|
||||||
@@ -316,6 +315,11 @@ class TransfoXLConfig(object):
|
|||||||
"""Serializes this instance to a JSON string."""
|
"""Serializes this instance to a JSON string."""
|
||||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||||
|
|
||||||
|
def to_json_file(self, json_file_path):
|
||||||
|
""" Save this instance to a json file."""
|
||||||
|
with open(json_file_path, "w", encoding='utf-8') as writer:
|
||||||
|
writer.write(self.to_json_string())
|
||||||
|
|
||||||
|
|
||||||
class PositionalEmbedding(nn.Module):
|
class PositionalEmbedding(nn.Module):
|
||||||
def __init__(self, demb):
|
def __init__(self, demb):
|
||||||
@@ -940,7 +944,7 @@ class TransfoXLPreTrainedModel(nn.Module):
|
|||||||
# Instantiate model.
|
# Instantiate model.
|
||||||
model = cls(config, *inputs, **kwargs)
|
model = cls(config, *inputs, **kwargs)
|
||||||
if state_dict is None and not from_tf:
|
if state_dict is None and not from_tf:
|
||||||
state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
|
state_dict = torch.load(resolved_archive_file, map_location='cpu')
|
||||||
if from_tf:
|
if from_tf:
|
||||||
# Directly load from a TensorFlow checkpoint
|
# Directly load from a TensorFlow checkpoint
|
||||||
return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path)
|
return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path)
|
||||||
|
|||||||
@@ -134,6 +134,21 @@ class BertTokenizer(object):
|
|||||||
tokens.append(self.ids_to_tokens[i])
|
tokens.append(self.ids_to_tokens[i])
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
def save_vocabulary(self, vocab_path):
|
||||||
|
"""Save the tokenizer vocabulary to a directory or file."""
|
||||||
|
index = 0
|
||||||
|
if os.path.isdir(vocab_path):
|
||||||
|
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
|
||||||
|
with open(vocab_file, "w", encoding="utf-8") as writer:
|
||||||
|
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
||||||
|
if index != token_index:
|
||||||
|
logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
|
||||||
|
" Please check that the vocabulary is not corrupted!".format(vocab_file))
|
||||||
|
index = token_index
|
||||||
|
writer.write(token + u'\n')
|
||||||
|
index += 1
|
||||||
|
return vocab_file
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
|
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
from __future__ import (absolute_import, division, print_function,
|
from __future__ import (absolute_import, division, print_function,
|
||||||
unicode_literals)
|
unicode_literals)
|
||||||
|
|
||||||
|
import sys
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@@ -45,6 +46,7 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
|
|||||||
}
|
}
|
||||||
VOCAB_NAME = 'vocab.json'
|
VOCAB_NAME = 'vocab.json'
|
||||||
MERGES_NAME = 'merges.txt'
|
MERGES_NAME = 'merges.txt'
|
||||||
|
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def bytes_to_unicode():
|
def bytes_to_unicode():
|
||||||
@@ -57,6 +59,7 @@ def bytes_to_unicode():
|
|||||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||||
"""
|
"""
|
||||||
|
_chr = unichr if sys.version_info[0] == 2 else chr
|
||||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||||
cs = bs[:]
|
cs = bs[:]
|
||||||
n = 0
|
n = 0
|
||||||
@@ -65,7 +68,7 @@ def bytes_to_unicode():
|
|||||||
bs.append(b)
|
bs.append(b)
|
||||||
cs.append(2**8+n)
|
cs.append(2**8+n)
|
||||||
n += 1
|
n += 1
|
||||||
cs = [chr(n) for n in cs]
|
cs = [_chr(n) for n in cs]
|
||||||
return dict(zip(bs, cs))
|
return dict(zip(bs, cs))
|
||||||
|
|
||||||
def get_pairs(word):
|
def get_pairs(word):
|
||||||
@@ -94,9 +97,15 @@ class GPT2Tokenizer(object):
|
|||||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||||
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
|
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||||
|
special_tokens_file = None
|
||||||
else:
|
else:
|
||||||
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
||||||
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
|
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
|
||||||
|
special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
|
||||||
|
if not os.path.exists(special_tokens_file):
|
||||||
|
special_tokens_file = None
|
||||||
|
else:
|
||||||
|
logger.info("loading special tokens file {}".format(special_tokens_file))
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||||
@@ -125,10 +134,14 @@ class GPT2Tokenizer(object):
|
|||||||
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
|
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
|
||||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
||||||
# Instantiate tokenizer.
|
# Instantiate tokenizer.
|
||||||
tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
|
if special_tokens_file and 'special_tokens' not in kwargs:
|
||||||
|
special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
|
||||||
|
else:
|
||||||
|
special_tokens = kwargs.pop('special_tokens', [])
|
||||||
|
tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
||||||
def __init__(self, vocab_file, merges_file, errors='replace', max_len=None):
|
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
|
||||||
self.max_len = max_len if max_len is not None else int(1e12)
|
self.max_len = max_len if max_len is not None else int(1e12)
|
||||||
self.encoder = json.load(open(vocab_file))
|
self.encoder = json.load(open(vocab_file))
|
||||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||||
@@ -143,8 +156,25 @@ class GPT2Tokenizer(object):
|
|||||||
# Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
# Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
||||||
|
|
||||||
|
self.special_tokens = {}
|
||||||
|
self.special_tokens_decoder = {}
|
||||||
|
self.set_special_tokens(special_tokens)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.encoder)
|
return len(self.encoder) + len(self.special_tokens)
|
||||||
|
|
||||||
|
def set_special_tokens(self, special_tokens):
|
||||||
|
""" Add a list of additional tokens to the encoder.
|
||||||
|
The additional tokens are indexed starting from the last index of the
|
||||||
|
current vocabulary in the order of the `special_tokens` list.
|
||||||
|
"""
|
||||||
|
if not special_tokens:
|
||||||
|
self.special_tokens = {}
|
||||||
|
self.special_tokens_decoder = {}
|
||||||
|
return
|
||||||
|
self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
|
||||||
|
self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
|
||||||
|
logger.info("Special tokens {}".format(self.special_tokens))
|
||||||
|
|
||||||
def bpe(self, token):
|
def bpe(self, token):
|
||||||
if token in self.cache:
|
if token in self.cache:
|
||||||
@@ -187,20 +217,85 @@ class GPT2Tokenizer(object):
|
|||||||
self.cache[token] = word
|
self.cache[token] = word
|
||||||
return word
|
return word
|
||||||
|
|
||||||
def encode(self, text):
|
def tokenize(self, text):
|
||||||
|
""" Tokenize a string. """
|
||||||
bpe_tokens = []
|
bpe_tokens = []
|
||||||
for token in re.findall(self.pat, text):
|
for token in re.findall(self.pat, text):
|
||||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
||||||
bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
|
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||||
if len(bpe_tokens) > self.max_len:
|
return bpe_tokens
|
||||||
|
|
||||||
|
def convert_tokens_to_ids(self, tokens):
|
||||||
|
""" Converts a sequence of tokens into ids using the vocab. """
|
||||||
|
ids = []
|
||||||
|
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
|
||||||
|
if tokens in self.special_tokens:
|
||||||
|
return self.special_tokens[tokens]
|
||||||
|
else:
|
||||||
|
return self.encoder.get(tokens, 0)
|
||||||
|
for token in tokens:
|
||||||
|
if token in self.special_tokens:
|
||||||
|
ids.append(self.special_tokens[token])
|
||||||
|
else:
|
||||||
|
ids.append(self.encoder.get(token, 0))
|
||||||
|
if len(ids) > self.max_len:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Token indices sequence length is longer than the specified maximum "
|
"Token indices sequence length is longer than the specified maximum "
|
||||||
" sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
|
" sequence length for this OpenAI GPT model ({} > {}). Running this"
|
||||||
" sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
|
" sequence through the model will result in indexing errors".format(len(ids), self.max_len)
|
||||||
)
|
)
|
||||||
return bpe_tokens
|
return ids
|
||||||
|
|
||||||
|
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||||
|
"""Converts a sequence of ids in BPE tokens using the vocab."""
|
||||||
|
tokens = []
|
||||||
|
for i in ids:
|
||||||
|
if i in self.special_tokens_decoder:
|
||||||
|
if not skip_special_tokens:
|
||||||
|
tokens.append(self.special_tokens_decoder[i])
|
||||||
|
else:
|
||||||
|
tokens.append(self.decoder[i])
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
def encode(self, text):
|
||||||
|
return self.convert_tokens_to_ids(self.tokenize(text))
|
||||||
|
|
||||||
def decode(self, tokens):
|
def decode(self, tokens):
|
||||||
text = ''.join([self.decoder[token] for token in tokens])
|
text = ''.join([self.decoder[token] for token in tokens])
|
||||||
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
|
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def save_vocabulary(self, vocab_path):
|
||||||
|
"""Save the tokenizer vocabulary and merge files to a directory."""
|
||||||
|
if not os.path.isdir(vocab_path):
|
||||||
|
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
|
||||||
|
return
|
||||||
|
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
|
||||||
|
merge_file = os.path.join(vocab_path, MERGES_NAME)
|
||||||
|
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
|
||||||
|
|
||||||
|
with open(vocab_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(json.dumps(self.encoder, ensure_ascii=False))
|
||||||
|
|
||||||
|
index = 0
|
||||||
|
with open(merge_file, "w", encoding="utf-8") as writer:
|
||||||
|
writer.write(u'#version: 0.2\n')
|
||||||
|
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
|
||||||
|
if index != token_index:
|
||||||
|
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
|
||||||
|
" Please check that the tokenizer is not corrupted!".format(merge_file))
|
||||||
|
index = token_index
|
||||||
|
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||||
|
index += 1
|
||||||
|
|
||||||
|
index = len(self.encoder)
|
||||||
|
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
|
||||||
|
for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
|
||||||
|
if index != token_index:
|
||||||
|
logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
|
||||||
|
" Please check that the tokenizer is not corrupted!".format(special_tokens_file))
|
||||||
|
index = token_index
|
||||||
|
writer.write(token + u'\n')
|
||||||
|
index += 1
|
||||||
|
|
||||||
|
return vocab_file, merge_file, special_tokens_file
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
|
|||||||
}
|
}
|
||||||
VOCAB_NAME = 'vocab.json'
|
VOCAB_NAME = 'vocab.json'
|
||||||
MERGES_NAME = 'merges.txt'
|
MERGES_NAME = 'merges.txt'
|
||||||
|
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
|
||||||
|
|
||||||
def get_pairs(word):
|
def get_pairs(word):
|
||||||
"""
|
"""
|
||||||
@@ -86,9 +87,15 @@ class OpenAIGPTTokenizer(object):
|
|||||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||||
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
|
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||||
|
special_tokens_file = None
|
||||||
else:
|
else:
|
||||||
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
||||||
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
|
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
|
||||||
|
special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
|
||||||
|
if not os.path.exists(special_tokens_file):
|
||||||
|
special_tokens_file = None
|
||||||
|
else:
|
||||||
|
logger.info("loading special tokens file {}".format(special_tokens_file))
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||||
@@ -117,7 +124,11 @@ class OpenAIGPTTokenizer(object):
|
|||||||
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
|
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
|
||||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
||||||
# Instantiate tokenizer.
|
# Instantiate tokenizer.
|
||||||
tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
|
if special_tokens_file and 'special_tokens' not in kwargs:
|
||||||
|
special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
|
||||||
|
else:
|
||||||
|
special_tokens = kwargs.pop('special_tokens', [])
|
||||||
|
tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
||||||
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
|
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
|
||||||
@@ -139,6 +150,8 @@ class OpenAIGPTTokenizer(object):
|
|||||||
merges = [tuple(merge.split()) for merge in merges]
|
merges = [tuple(merge.split()) for merge in merges]
|
||||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||||
self.cache = {}
|
self.cache = {}
|
||||||
|
self.special_tokens = {}
|
||||||
|
self.special_tokens_decoder = {}
|
||||||
self.set_special_tokens(special_tokens)
|
self.set_special_tokens(special_tokens)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
@@ -250,14 +263,51 @@ class OpenAIGPTTokenizer(object):
|
|||||||
tokens.append(self.decoder[i])
|
tokens.append(self.decoder[i])
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
|
def encode(self, text):
|
||||||
|
return self.convert_tokens_to_ids(self.tokenize(text))
|
||||||
|
|
||||||
|
def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
|
||||||
"""Converts a sequence of ids in a string."""
|
"""Converts a sequence of ids in a string."""
|
||||||
tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
|
tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
|
||||||
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
||||||
if clean_up_tokenization_spaces:
|
if clean_up_tokenization_spaces:
|
||||||
out_string = out_string.replace('<unk>', '')
|
out_string = out_string.replace('<unk>', '')
|
||||||
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
|
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
|
||||||
).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
|
).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
|
||||||
).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
|
).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
|
||||||
).replace(" 've", "'ve")
|
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
|
def save_vocabulary(self, vocab_path):
|
||||||
|
"""Save the tokenizer vocabulary and merge files to a directory."""
|
||||||
|
if not os.path.isdir(vocab_path):
|
||||||
|
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
|
||||||
|
return
|
||||||
|
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
|
||||||
|
merge_file = os.path.join(vocab_path, MERGES_NAME)
|
||||||
|
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
|
||||||
|
|
||||||
|
with open(vocab_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(json.dumps(self.encoder, ensure_ascii=False))
|
||||||
|
|
||||||
|
index = 0
|
||||||
|
with open(merge_file, "w", encoding="utf-8") as writer:
|
||||||
|
writer.write(u'#version: 0.2\n')
|
||||||
|
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
|
||||||
|
if index != token_index:
|
||||||
|
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
|
||||||
|
" Please check that the tokenizer is not corrupted!".format(merge_file))
|
||||||
|
index = token_index
|
||||||
|
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||||
|
index += 1
|
||||||
|
|
||||||
|
index = len(self.encoder)
|
||||||
|
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
|
||||||
|
for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
|
||||||
|
if index != token_index:
|
||||||
|
logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
|
||||||
|
" Please check that the tokenizer is not corrupted!".format(special_tokens_file))
|
||||||
|
index = token_index
|
||||||
|
writer.write(token + u'\n')
|
||||||
|
index += 1
|
||||||
|
|
||||||
|
return vocab_file, merge_file, special_tokens_file
|
||||||
|
|||||||
@@ -63,7 +63,10 @@ class TransfoXLTokenizer(object):
|
|||||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||||
else:
|
else:
|
||||||
|
if os.path.isdir(pretrained_model_name_or_path):
|
||||||
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
||||||
|
else:
|
||||||
|
vocab_file = pretrained_model_name_or_path
|
||||||
# redirect to the cache, if necessary
|
# redirect to the cache, if necessary
|
||||||
try:
|
try:
|
||||||
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||||
@@ -141,6 +144,14 @@ class TransfoXLTokenizer(object):
|
|||||||
else:
|
else:
|
||||||
raise ValueError('No <unkown> token in vocabulary')
|
raise ValueError('No <unkown> token in vocabulary')
|
||||||
|
|
||||||
|
def save_vocabulary(self, vocab_path):
|
||||||
|
"""Save the tokenizer vocabulary to a directory or file."""
|
||||||
|
index = 0
|
||||||
|
if os.path.isdir(vocab_path):
|
||||||
|
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
|
||||||
|
torch.save(self.__dict__, vocab_file)
|
||||||
|
return vocab_file
|
||||||
|
|
||||||
def build_vocab(self):
|
def build_vocab(self):
|
||||||
if self.vocab_file:
|
if self.vocab_file:
|
||||||
print('building vocab from {}'.format(self.vocab_file))
|
print('building vocab from {}'.format(self.vocab_file))
|
||||||
@@ -245,82 +256,24 @@ class TransfoXLTokenizer(object):
|
|||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.idx2sym)
|
return len(self.idx2sym)
|
||||||
|
|
||||||
def _run_split_on_punc(self, text):
|
|
||||||
"""Splits punctuation on a piece of text."""
|
|
||||||
if text in self.never_split:
|
|
||||||
return [text]
|
|
||||||
chars = list(text)
|
|
||||||
i = 0
|
|
||||||
start_new_word = True
|
|
||||||
output = []
|
|
||||||
while i < len(chars):
|
|
||||||
char = chars[i]
|
|
||||||
if _is_punctuation(char):
|
|
||||||
output.append([char])
|
|
||||||
start_new_word = True
|
|
||||||
else:
|
|
||||||
if start_new_word:
|
|
||||||
output.append([])
|
|
||||||
start_new_word = False
|
|
||||||
output[-1].append(char)
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
return ["".join(x) for x in output]
|
|
||||||
|
|
||||||
def _run_strip_accents(self, text):
|
|
||||||
"""Strips accents from a piece of text."""
|
|
||||||
text = unicodedata.normalize("NFD", text)
|
|
||||||
output = []
|
|
||||||
for char in text:
|
|
||||||
cat = unicodedata.category(char)
|
|
||||||
if cat == "Mn":
|
|
||||||
continue
|
|
||||||
output.append(char)
|
|
||||||
return "".join(output)
|
|
||||||
|
|
||||||
def _clean_text(self, text):
|
|
||||||
"""Performs invalid character removal and whitespace cleanup on text."""
|
|
||||||
output = []
|
|
||||||
for char in text:
|
|
||||||
cp = ord(char)
|
|
||||||
if cp == 0 or cp == 0xfffd or _is_control(char):
|
|
||||||
continue
|
|
||||||
if _is_whitespace(char):
|
|
||||||
output.append(" ")
|
|
||||||
else:
|
|
||||||
output.append(char)
|
|
||||||
return "".join(output)
|
|
||||||
|
|
||||||
def whitespace_tokenize(self, text):
|
|
||||||
"""Runs basic whitespace cleaning and splitting on a piece of text."""
|
|
||||||
text = text.strip()
|
|
||||||
if not text:
|
|
||||||
return []
|
|
||||||
if self.delimiter == '':
|
|
||||||
tokens = text
|
|
||||||
else:
|
|
||||||
tokens = text.split(self.delimiter)
|
|
||||||
return tokens
|
|
||||||
|
|
||||||
def tokenize(self, line, add_eos=False, add_double_eos=False):
|
def tokenize(self, line, add_eos=False, add_double_eos=False):
|
||||||
line = self._clean_text(line)
|
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
# convert to lower case
|
||||||
|
if self.lower_case:
|
||||||
|
line = line.lower()
|
||||||
|
|
||||||
symbols = self.whitespace_tokenize(line)
|
# empty delimiter '' will evaluate False
|
||||||
|
if self.delimiter == '':
|
||||||
split_symbols = []
|
symbols = line
|
||||||
for symbol in symbols:
|
else:
|
||||||
if self.lower_case and symbol not in self.never_split:
|
symbols = line.split(self.delimiter)
|
||||||
symbol = symbol.lower()
|
|
||||||
symbol = self._run_strip_accents(symbol)
|
|
||||||
split_symbols.extend(self._run_split_on_punc(symbol))
|
|
||||||
|
|
||||||
if add_double_eos: # lm1b
|
if add_double_eos: # lm1b
|
||||||
return ['<S>'] + split_symbols + ['<S>']
|
return ['<S>'] + symbols + ['<S>']
|
||||||
elif add_eos:
|
elif add_eos:
|
||||||
return split_symbols + ['<eos>']
|
return symbols + ['<eos>']
|
||||||
else:
|
else:
|
||||||
return split_symbols
|
return symbols
|
||||||
|
|
||||||
|
|
||||||
class LMOrderedIterator(object):
|
class LMOrderedIterator(object):
|
||||||
@@ -631,42 +584,3 @@ def get_lm_corpus(datadir, dataset):
|
|||||||
torch.save(corpus, fn)
|
torch.save(corpus, fn)
|
||||||
|
|
||||||
return corpus
|
return corpus
|
||||||
|
|
||||||
def _is_whitespace(char):
|
|
||||||
"""Checks whether `chars` is a whitespace character."""
|
|
||||||
# \t, \n, and \r are technically contorl characters but we treat them
|
|
||||||
# as whitespace since they are generally considered as such.
|
|
||||||
if char == " " or char == "\t" or char == "\n" or char == "\r":
|
|
||||||
return True
|
|
||||||
cat = unicodedata.category(char)
|
|
||||||
if cat == "Zs":
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _is_control(char):
|
|
||||||
"""Checks whether `chars` is a control character."""
|
|
||||||
# These are technically control characters but we count them as whitespace
|
|
||||||
# characters.
|
|
||||||
if char == "\t" or char == "\n" or char == "\r":
|
|
||||||
return False
|
|
||||||
cat = unicodedata.category(char)
|
|
||||||
if cat.startswith("C"):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _is_punctuation(char):
|
|
||||||
"""Checks whether `chars` is a punctuation character."""
|
|
||||||
cp = ord(char)
|
|
||||||
# We treat all non-letter/number ASCII as punctuation.
|
|
||||||
# Characters such as "^", "$", and "`" are not in the Unicode
|
|
||||||
# Punctuation class but we treat them as punctuation anyways, for
|
|
||||||
# consistency.
|
|
||||||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
|
|
||||||
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
|
|
||||||
return True
|
|
||||||
cat = unicodedata.category(char)
|
|
||||||
if cat.startswith("P"):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|||||||
19
tests/conftest.py
Normal file
19
tests/conftest.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# content of conftest.py
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_addoption(parser):
|
||||||
|
parser.addoption(
|
||||||
|
"--runslow", action="store_true", default=False, help="run slow tests"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_collection_modifyitems(config, items):
|
||||||
|
if config.getoption("--runslow"):
|
||||||
|
# --runslow given in cli: do not skip slow tests
|
||||||
|
return
|
||||||
|
skip_slow = pytest.mark.skip(reason="need --runslow option to run")
|
||||||
|
for item in items:
|
||||||
|
if "slow" in item.keywords:
|
||||||
|
item.add_marker(skip_slow)
|
||||||
@@ -16,15 +16,18 @@ from __future__ import absolute_import
|
|||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
|
from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
|
||||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||||
|
from pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
class GPT2ModelTest(unittest.TestCase):
|
class GPT2ModelTest(unittest.TestCase):
|
||||||
class GPT2ModelTester(object):
|
class GPT2ModelTester(object):
|
||||||
@@ -176,6 +179,22 @@ class GPT2ModelTest(unittest.TestCase):
|
|||||||
self.assertEqual(obj["vocab_size"], 99)
|
self.assertEqual(obj["vocab_size"], 99)
|
||||||
self.assertEqual(obj["n_embd"], 37)
|
self.assertEqual(obj["n_embd"], 37)
|
||||||
|
|
||||||
|
def test_config_to_json_file(self):
|
||||||
|
config_first = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
|
||||||
|
json_file_path = "/tmp/config.json"
|
||||||
|
config_first.to_json_file(json_file_path)
|
||||||
|
config_second = GPT2Config.from_json_file(json_file_path)
|
||||||
|
os.remove(json_file_path)
|
||||||
|
self.assertEqual(config_second.to_dict(), config_first.to_dict())
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
def test_model_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/pytorch_pretrained_bert_test/"
|
||||||
|
for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
def run_tester(self, tester):
|
def run_tester(self, tester):
|
||||||
config_and_inputs = tester.prepare_config_and_inputs()
|
config_and_inputs = tester.prepare_config_and_inputs()
|
||||||
output_result = tester.create_gpt2_model(*config_and_inputs)
|
output_result = tester.create_gpt2_model(*config_and_inputs)
|
||||||
|
|||||||
@@ -16,15 +16,18 @@ from __future__ import absolute_import
|
|||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
|
from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
|
||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||||
|
from pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
class OpenAIGPTModelTest(unittest.TestCase):
|
class OpenAIGPTModelTest(unittest.TestCase):
|
||||||
class OpenAIGPTModelTester(object):
|
class OpenAIGPTModelTester(object):
|
||||||
@@ -188,6 +191,22 @@ class OpenAIGPTModelTest(unittest.TestCase):
|
|||||||
self.assertEqual(obj["vocab_size"], 99)
|
self.assertEqual(obj["vocab_size"], 99)
|
||||||
self.assertEqual(obj["n_embd"], 37)
|
self.assertEqual(obj["n_embd"], 37)
|
||||||
|
|
||||||
|
def test_config_to_json_file(self):
|
||||||
|
config_first = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
|
||||||
|
json_file_path = "/tmp/config.json"
|
||||||
|
config_first.to_json_file(json_file_path)
|
||||||
|
config_second = OpenAIGPTConfig.from_json_file(json_file_path)
|
||||||
|
os.remove(json_file_path)
|
||||||
|
self.assertEqual(config_second.to_dict(), config_first.to_dict())
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
def test_model_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/pytorch_pretrained_bert_test/"
|
||||||
|
for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
def run_tester(self, tester):
|
def run_tester(self, tester):
|
||||||
config_and_inputs = tester.prepare_config_and_inputs()
|
config_and_inputs = tester.prepare_config_and_inputs()
|
||||||
output_result = tester.create_openai_model(*config_and_inputs)
|
output_result = tester.create_openai_model(*config_and_inputs)
|
||||||
|
|||||||
@@ -16,9 +16,12 @@ from __future__ import absolute_import
|
|||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -26,6 +29,7 @@ from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
|
|||||||
BertForNextSentencePrediction, BertForPreTraining,
|
BertForNextSentencePrediction, BertForPreTraining,
|
||||||
BertForQuestionAnswering, BertForSequenceClassification,
|
BertForQuestionAnswering, BertForSequenceClassification,
|
||||||
BertForTokenClassification)
|
BertForTokenClassification)
|
||||||
|
from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
class BertModelTest(unittest.TestCase):
|
class BertModelTest(unittest.TestCase):
|
||||||
@@ -251,6 +255,22 @@ class BertModelTest(unittest.TestCase):
|
|||||||
self.assertEqual(obj["vocab_size"], 99)
|
self.assertEqual(obj["vocab_size"], 99)
|
||||||
self.assertEqual(obj["hidden_size"], 37)
|
self.assertEqual(obj["hidden_size"], 37)
|
||||||
|
|
||||||
|
def test_config_to_json_file(self):
|
||||||
|
config_first = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37)
|
||||||
|
json_file_path = "/tmp/config.json"
|
||||||
|
config_first.to_json_file(json_file_path)
|
||||||
|
config_second = BertConfig.from_json_file(json_file_path)
|
||||||
|
os.remove(json_file_path)
|
||||||
|
self.assertEqual(config_second.to_dict(), config_first.to_dict())
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
def test_model_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/pytorch_pretrained_bert_test/"
|
||||||
|
for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
def run_tester(self, tester):
|
def run_tester(self, tester):
|
||||||
config_and_inputs = tester.prepare_config_and_inputs()
|
config_and_inputs = tester.prepare_config_and_inputs()
|
||||||
output_result = tester.create_bert_model(*config_and_inputs)
|
output_result = tester.create_bert_model(*config_and_inputs)
|
||||||
|
|||||||
@@ -16,14 +16,17 @@ from __future__ import absolute_import
|
|||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
||||||
|
from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
class TransfoXLModelTest(unittest.TestCase):
|
class TransfoXLModelTest(unittest.TestCase):
|
||||||
class TransfoXLModelTester(object):
|
class TransfoXLModelTester(object):
|
||||||
@@ -186,6 +189,22 @@ class TransfoXLModelTest(unittest.TestCase):
|
|||||||
self.assertEqual(obj["n_token"], 96)
|
self.assertEqual(obj["n_token"], 96)
|
||||||
self.assertEqual(obj["d_embed"], 37)
|
self.assertEqual(obj["d_embed"], 37)
|
||||||
|
|
||||||
|
def test_config_to_json_file(self):
|
||||||
|
config_first = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)
|
||||||
|
json_file_path = "/tmp/config.json"
|
||||||
|
config_first.to_json_file(json_file_path)
|
||||||
|
config_second = TransfoXLConfig.from_json_file(json_file_path)
|
||||||
|
os.remove(json_file_path)
|
||||||
|
self.assertEqual(config_second.to_dict(), config_first.to_dict())
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
def test_model_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/pytorch_pretrained_bert_test/"
|
||||||
|
for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
def run_tester(self, tester):
|
def run_tester(self, tester):
|
||||||
config_and_inputs = tester.prepare_config_and_inputs()
|
config_and_inputs = tester.prepare_config_and_inputs()
|
||||||
|
|
||||||
|
|||||||
77
tests/tokenization_gpt2_test.py
Normal file
77
tests/tokenization_gpt2_test.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
|
class GPT2TokenizationTest(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_full_tokenizer(self):
|
||||||
|
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
|
||||||
|
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||||
|
"lo", "low", "er",
|
||||||
|
"low", "lowest", "newer", "wider"]
|
||||||
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
|
merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
|
||||||
|
with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
|
||||||
|
fp.write(json.dumps(vocab_tokens))
|
||||||
|
vocab_file = fp.name
|
||||||
|
with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
|
||||||
|
fp.write("\n".join(merges))
|
||||||
|
merges_file = fp.name
|
||||||
|
|
||||||
|
tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
|
||||||
|
os.remove(vocab_file)
|
||||||
|
os.remove(merges_file)
|
||||||
|
|
||||||
|
text = "lower"
|
||||||
|
bpe_tokens = ["low", "er"]
|
||||||
|
tokens = tokenizer.tokenize(text)
|
||||||
|
self.assertListEqual(tokens, bpe_tokens)
|
||||||
|
|
||||||
|
input_tokens = tokens + ["<unk>"]
|
||||||
|
input_bpe_tokens = [13, 12, 16]
|
||||||
|
self.assertListEqual(
|
||||||
|
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||||
|
|
||||||
|
vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
|
||||||
|
tokenizer_2 = GPT2Tokenizer.from_pretrained("/tmp/")
|
||||||
|
os.remove(vocab_file)
|
||||||
|
os.remove(merges_file)
|
||||||
|
os.remove(special_tokens_file)
|
||||||
|
|
||||||
|
self.assertListEqual(
|
||||||
|
[tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
|
||||||
|
tokenizer.special_tokens, tokenizer.special_tokens_decoder],
|
||||||
|
[tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
|
||||||
|
tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
|
||||||
|
|
||||||
|
# @pytest.mark.slow
|
||||||
|
def test_tokenizer_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/pytorch_pretrained_bert_test/"
|
||||||
|
for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(tokenizer)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import json
|
import json
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
|
||||||
from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
|
from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
class OpenAIGPTTokenizationTest(unittest.TestCase):
|
class OpenAIGPTTokenizationTest(unittest.TestCase):
|
||||||
@@ -32,13 +34,13 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
|
|||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
|
merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
|
||||||
with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
|
with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
|
||||||
json.dump(vocab_tokens, fp)
|
fp.write(json.dumps(vocab_tokens))
|
||||||
vocab_file = fp.name
|
vocab_file = fp.name
|
||||||
with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
|
with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
merges_file = fp.name
|
merges_file = fp.name
|
||||||
|
|
||||||
tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
|
tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
|
||||||
os.remove(vocab_file)
|
os.remove(vocab_file)
|
||||||
os.remove(merges_file)
|
os.remove(merges_file)
|
||||||
|
|
||||||
@@ -52,5 +54,26 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
|
|||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||||
|
|
||||||
|
vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
|
||||||
|
tokenizer_2 = OpenAIGPTTokenizer.from_pretrained("/tmp/")
|
||||||
|
os.remove(vocab_file)
|
||||||
|
os.remove(merges_file)
|
||||||
|
os.remove(special_tokens_file)
|
||||||
|
|
||||||
|
self.assertListEqual(
|
||||||
|
[tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
|
||||||
|
tokenizer.special_tokens, tokenizer.special_tokens_decoder],
|
||||||
|
[tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
|
||||||
|
tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
def test_tokenizer_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/pytorch_pretrained_bert_test/"
|
||||||
|
for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(tokenizer)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -17,12 +17,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
from io import open
|
from io import open
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
|
||||||
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
|
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
|
||||||
BertTokenizer,
|
BertTokenizer,
|
||||||
WordpieceTokenizer,
|
WordpieceTokenizer,
|
||||||
_is_control, _is_punctuation,
|
_is_control, _is_punctuation,
|
||||||
_is_whitespace)
|
_is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)
|
||||||
|
|
||||||
|
|
||||||
class TokenizationTest(unittest.TestCase):
|
class TokenizationTest(unittest.TestCase):
|
||||||
@@ -46,6 +48,24 @@ class TokenizationTest(unittest.TestCase):
|
|||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
|
tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
|
||||||
|
|
||||||
|
vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
|
||||||
|
tokenizer.from_pretrained(vocab_file)
|
||||||
|
os.remove(vocab_file)
|
||||||
|
|
||||||
|
tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
|
||||||
|
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
|
||||||
|
|
||||||
|
self.assertListEqual(
|
||||||
|
tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
def test_tokenizer_from_pretrained(self):
|
||||||
|
cache_dir = "/tmp/pytorch_pretrained_bert_test/"
|
||||||
|
for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(tokenizer)
|
||||||
|
|
||||||
def test_chinese(self):
|
def test_chinese(self):
|
||||||
tokenizer = BasicTokenizer()
|
tokenizer = BasicTokenizer()
|
||||||
|
|
||||||
|
|||||||
@@ -17,10 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
from io import open
|
from io import open
|
||||||
|
import shutil
|
||||||
|
import pytest
|
||||||
|
|
||||||
from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer,
|
from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
|
||||||
_is_control, _is_punctuation,
|
|
||||||
_is_whitespace)
|
|
||||||
|
|
||||||
|
|
||||||
class TransfoXLTokenizationTest(unittest.TestCase):
|
class TransfoXLTokenizationTest(unittest.TestCase):
|
||||||
@@ -37,54 +37,44 @@ class TransfoXLTokenizationTest(unittest.TestCase):
|
|||||||
tokenizer.build_vocab()
|
tokenizer.build_vocab()
|
||||||
os.remove(vocab_file)
|
os.remove(vocab_file)
|
||||||
|
|
||||||
tokens = tokenizer.tokenize(u"<unk> UNwant\u00E9d,running")
|
tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
|
||||||
self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
|
self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
|
||||||
|
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
|
tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
|
||||||
|
|
||||||
|
vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
|
||||||
|
tokenizer.from_pretrained(vocab_file)
|
||||||
|
os.remove(vocab_file)
|
||||||
|
|
||||||
|
tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
|
||||||
|
self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
|
||||||
|
|
||||||
|
self.assertListEqual(
|
||||||
|
tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
|
||||||
|
|
||||||
|
|
||||||
def test_full_tokenizer_lower(self):
|
def test_full_tokenizer_lower(self):
|
||||||
tokenizer = TransfoXLTokenizer(lower_case=True)
|
tokenizer = TransfoXLTokenizer(lower_case=True)
|
||||||
|
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
|
tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
|
||||||
["hello", "!", "how", "are", "you", "?"])
|
["hello", "!", "how", "are", "you", "?"])
|
||||||
self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
|
|
||||||
|
|
||||||
def test_full_tokenizer_no_lower(self):
|
def test_full_tokenizer_no_lower(self):
|
||||||
tokenizer = TransfoXLTokenizer(lower_case=False)
|
tokenizer = TransfoXLTokenizer(lower_case=False)
|
||||||
|
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
|
tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
|
||||||
["HeLLo", "!", "how", "Are", "yoU", "?"])
|
["HeLLo", "!", "how", "Are", "yoU", "?"])
|
||||||
|
|
||||||
def test_is_whitespace(self):
|
@pytest.mark.slow
|
||||||
self.assertTrue(_is_whitespace(u" "))
|
def test_tokenizer_from_pretrained(self):
|
||||||
self.assertTrue(_is_whitespace(u"\t"))
|
cache_dir = "/tmp/pytorch_pretrained_bert_test/"
|
||||||
self.assertTrue(_is_whitespace(u"\r"))
|
for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
|
||||||
self.assertTrue(_is_whitespace(u"\n"))
|
tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
|
||||||
self.assertTrue(_is_whitespace(u"\u00A0"))
|
shutil.rmtree(cache_dir)
|
||||||
|
self.assertIsNotNone(tokenizer)
|
||||||
self.assertFalse(_is_whitespace(u"A"))
|
|
||||||
self.assertFalse(_is_whitespace(u"-"))
|
|
||||||
|
|
||||||
def test_is_control(self):
|
|
||||||
self.assertTrue(_is_control(u"\u0005"))
|
|
||||||
|
|
||||||
self.assertFalse(_is_control(u"A"))
|
|
||||||
self.assertFalse(_is_control(u" "))
|
|
||||||
self.assertFalse(_is_control(u"\t"))
|
|
||||||
self.assertFalse(_is_control(u"\r"))
|
|
||||||
|
|
||||||
def test_is_punctuation(self):
|
|
||||||
self.assertTrue(_is_punctuation(u"-"))
|
|
||||||
self.assertTrue(_is_punctuation(u"$"))
|
|
||||||
self.assertTrue(_is_punctuation(u"`"))
|
|
||||||
self.assertTrue(_is_punctuation(u"."))
|
|
||||||
|
|
||||||
self.assertFalse(_is_punctuation(u"A"))
|
|
||||||
self.assertFalse(_is_punctuation(u" "))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user