From 1849aa7d39c78a6ad7e33fb9d4800582a53c43b1 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 16 Jul 2019 15:11:29 +0200 Subject: [PATCH] update readme and pretrained model weight files --- README.md | 43 +++++++++++-------- examples/requirements.txt | 2 + .../tests/tokenization_tests_commons.py | 12 ++++++ .../tokenization_transfo_xl.py | 1 - pytorch_transformers/tokenization_utils.py | 13 ++++++ pytorch_transformers/tokenization_xlm.py | 7 +++ 6 files changed, 60 insertions(+), 18 deletions(-) create mode 100644 examples/requirements.txt diff --git a/README.md b/README.md index b3685fa357..afb351cdd8 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,7 @@ with torch.no_grad(): # See the models docstrings for the detail of all the outputs # In our case, the first element is the hidden state of the last layer of the Bert model encoded_layers = outputs[0] + # We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension) assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size) ``` @@ -218,22 +219,30 @@ Before running anyone of these GLUE tasks you should download the [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpack it to some directory `$GLUE_DIR`. 
+You should also install the additional packages required by the examples: + +```shell +pip install -r ./examples/requirements.txt +``` + ```shell export GLUE_DIR=/path/to/glue export TASK_NAME=MRPC -python run_bert_classifier.py \ - --task_name $TASK_NAME \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir $GLUE_DIR/$TASK_NAME \ - --bert_model bert-base-uncased \ - --max_seq_length 128 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3.0 \ - --output_dir /tmp/$TASK_NAME/ +python ./examples/run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/$TASK_NAME \ + --max_seq_length 128 \ + --per_gpu_eval_batch_size=8 \ + --per_gpu_train_batch_size=8 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/$TASK_NAME/ ``` where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI. @@ -243,7 +252,7 @@ The dev set results will be present within the text file 'eval_results.txt' in t #### Fine-tuning XLNet model on the STS-B regression task This example code fine-tunes XLNet on the STS-B corpus using parallel training on a server with 4 V100 GPUs. -Parallel training is a simple way to use several GPU (but it is slower and less flexible than distributed training, see below). +Parallel training is a simple way to use several GPUs (but is slower and less flexible than distributed training, see below). 
```shell export GLUE_DIR=/path/to/glue @@ -252,6 +261,7 @@ python ./examples/run_glue.py \ --model_type xlnet \ --model_name_or_path xlnet-large-cased \ --do_train \ + --do_eval \ --task_name=sts-b \ --data_dir=${GLUE_DIR}/STS-B \ --output_dir=./proc_data/sts-b-110 \ @@ -266,15 +276,14 @@ python ./examples/run_glue.py \ --warmup_steps=120 ``` -On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. -These hyper-parameters give evaluation results pearsonr of `0.918`. +On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should result in a Pearson correlation coefficient of `+0.917` on the development set. #### Fine-tuning Bert model on the MRPC classification task This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) corpus using distributed training on 8 V100 GPUs to reach a F1 > 92. 
```bash -python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \ +python -m torch.distributed.launch --nproc_per_node 8 ./examples/run_glue.py \ --model_type bert \ --model_name_or_path bert-large-uncased-whole-word-masking \ --task_name MRPC \ @@ -308,7 +317,7 @@ Training with these hyper-parameters gave us the following results: This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD: ```bash -python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \ +python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \ --model_type bert \ --model_name_or_path bert-large-uncased-whole-word-masking \ --do_train \ diff --git a/examples/requirements.txt b/examples/requirements.txt new file mode 100644 index 0000000000..42abe8933c --- /dev/null +++ b/examples/requirements.txt @@ -0,0 +1,2 @@ +tensorboardX +scikit-learn \ No newline at end of file diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index e33ba3cb06..c37770b229 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -129,7 +129,19 @@ def create_and_check_required_methods_tokenizer(tester, input_text, output_text, tester.assertNotEqual(len(tokens_2), 0) tester.assertIsInstance(text_2, (str, unicode)) + +def create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs): + weights_list = list(tokenizer_class.max_model_input_sizes.keys()) + weights_lists_2 = [] + for file_id, map_list in tokenizer_class.pretrained_vocab_files_map.items(): + weights_lists_2.append(list(map_list.keys())) + + for weights_list_2 in weights_lists_2: + tester.assertListEqual(weights_list, weights_list_2) + + def create_and_check_tokenizer_commons(tester, input_text, 
output_text, tokenizer_class, *inputs, **kwargs): + create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs) create_and_check_required_methods_tokenizer(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs) create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs) create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs) diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py index 9406d48c7b..237f8ea387 100644 --- a/pytorch_transformers/tokenization_transfo_xl.py +++ b/pytorch_transformers/tokenization_transfo_xl.py @@ -138,7 +138,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer): def save_vocabulary(self, vocab_path): """Save the tokenizer vocabulary to a directory or file.""" - index = 0 if os.path.isdir(vocab_path): vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file']) torch.save(self.__dict__, vocab_file) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index df18f5e536..f603a29d74 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -163,6 +163,11 @@ class PreTrainedTokenizer(object): for file_id, map_list in cls.pretrained_vocab_files_map.items(): vocab_files[file_id] = map_list[pretrained_model_name_or_path] else: + logger.info( + "Model name '{}' not found in model shortcut name list ({}). " + "Assuming '{}' is a path or url to a directory containing tokenizer files.".format( + pretrained_model_name_or_path, ', '.join(s3_models), + pretrained_model_name_or_path)) all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE} all_vocab_files_names.update(cls.vocab_files_names) @@ -175,6 +180,14 @@ class PreTrainedTokenizer(object): logger.info("Didn't find file {}. 
We won't load it.".format(full_file_name)) full_file_name = None vocab_files[file_id] = full_file_name + if all(full_file_name is None for full_file_name in vocab_files.values()): + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find tokenizer files" + "at this path or url.".format( + pretrained_model_name_or_path, ', '.join(s3_models), + pretrained_model_name_or_path, )) + return None # Get files from url, cache, or disk depending on the case try: diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 42b61badcd..899f6b884f 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -59,6 +59,13 @@ PRETRAINED_VOCAB_FILES_MAP = { PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'xlm-mlm-en-2048': 512, + 'xlm-mlm-ende-1024': 512, + 'xlm-mlm-enfr-1024': 512, + 'xlm-mlm-enro-1024': 512, + 'xlm-mlm-tlm-xnli15-1024': 512, + 'xlm-mlm-xnli15-1024': 512, + 'xlm-clm-enfr-1024': 512, + 'xlm-clm-ende-1024': 512, } def get_pairs(word):