From c4e9615691a19128f446563718355aedf03cf01b Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Wed, 17 Jul 2019 09:08:40 -0700 Subject: [PATCH 01/36] Fix a path so that test can run on Windows --- pytorch_transformers/tests/modeling_common_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index 5ea98d68e2..e974ae865d 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -21,6 +21,7 @@ import os import shutil import json import random +import uuid import unittest import logging @@ -527,7 +528,7 @@ class ConfigTester(object): def create_and_test_config_to_json_file(self): config_first = self.config_class(**self.inputs_dict) - json_file_path = "/tmp/config.json" + json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") config_first.to_json_file(json_file_path) config_second = self.config_class.from_json_file(json_file_path) os.remove(json_file_path) From ba4bce2581f9a67caa44c3cc959a2dacb0090670 Mon Sep 17 00:00:00 2001 From: tuvuumass Date: Tue, 13 Aug 2019 11:26:27 -0400 Subject: [PATCH 02/36] fix issue #824 --- examples/run_bertology.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/run_bertology.py b/examples/run_bertology.py index 61c7440ecb..f11b73b54f 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -211,10 +211,12 @@ def prune_heads(args, model, eval_dataloader, head_mask): def main(): parser = argparse.ArgumentParser() + ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_name", default=None, type=str, required=True, - help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS)) + parser.add_argument("--model_name_or_path", default=None, type=str, required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join( + ALL_MODELS)) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) parser.add_argument("--output_dir", default=None, type=str, required=True, @@ -222,9 +224,9 @@ def main(): ## Other parameters parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") + help="Pretrained config name or path if not the same as model_name_or_path") parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") + help="Pretrained tokenizer name or path if not the same as model_name_or_path") parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--data_subset", type=int, default=-1, @@ -297,15 +299,15 @@ def main(): args.model_type = "" for key in MODEL_CLASSES: - if key in args.model_name.lower(): + if key in args.model_name_or_path.lower(): args.model_type = key # take the first match in model types break config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name, + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, output_attentions=True) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if 
args.tokenizer_name else args.model_name) - model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path) + model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab From 9ce36e3e4b0b17dd6df05e13e563570677cda39e Mon Sep 17 00:00:00 2001 From: samvelyan Date: Wed, 14 Aug 2019 08:57:09 +0000 Subject: [PATCH 03/36] Re-implemented tokenize() iteratively in PreTrainedTokenizer. --- pytorch_transformers/tokenization_utils.py | 42 ++++++++++++++++++---- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 2e75c83bfb..bdeeeb4877 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -428,7 +428,7 @@ class PreTrainedTokenizer(object): Parameters: special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``]. - + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). Returns: @@ -472,15 +472,45 @@ class PreTrainedTokenizer(object): Take care of added tokens. 
""" + def split_on_token(tok, text): + result = [] + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + sub_text = sub_text.strip() + if i == 0 and not sub_text: + result += [tok] + elif i == len(split_text) - 1: + if sub_text: + result += [sub_text] + else: + pass + else: + if sub_text: + result += [sub_text] + result += [tok] + return result + def split_on_tokens(tok_list, text): if not text: return [] if not tok_list: return self._tokenize(text, **kwargs) - tok = tok_list[0] - split_text = text.split(tok) - return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \ - for sub_text in split_text), [])[:-1] + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self.added_tokens_encoder \ + and sub_text not in self.all_special_tokens: + tokenized_text += split_on_token(tok, sub_text) + else: + tokenized_text += [sub_text] + text_list = tokenized_text + + return sum((self._tokenize(token, **kwargs) if token not \ + in self.added_tokens_encoder and token not in self.all_special_tokens \ + else [token] for token in tokenized_text), []) added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens tokenized_text = split_on_tokens(added_tokens, text) @@ -522,7 +552,7 @@ class PreTrainedTokenizer(object): def encode(self, text): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. - + Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``. 
""" return self.convert_tokens_to_ids(self.tokenize(text)) From b8ff56896ccbd27a54035a90a3bc278a44541a74 Mon Sep 17 00:00:00 2001 From: wangfei <1140554608@qq.com> Date: Fri, 16 Aug 2019 12:11:05 +0800 Subject: [PATCH 04/36] Fix bug of multi-gpu training in lm finetuning --- examples/lm_finetuning/finetune_on_pregenerated.py | 2 +- examples/lm_finetuning/simple_lm_finetuning.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 9fcc5f2cb1..7c40342f18 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -320,7 +320,7 @@ def main(): global_step += 1 # Save a trained model - if n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1 : + if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index ba5f832827..25333de0ed 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -507,7 +507,7 @@ def main(): if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(args.output_dir) and ( n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1 ): + if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) @@ -608,7 +608,7 @@ def main(): global_step += 1 # Save a trained model - if args.do_train and ( n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1): 
+ if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 7e7fc53da5f230db379ece739457c81b2f50f13e Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 16 Aug 2019 11:02:10 -0400 Subject: [PATCH 05/36] Fixing run_glue example with RoBERTa --- examples/run_glue.py | 2 +- examples/utils_glue.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index c0f70e0863..7fb0732e61 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -279,7 +279,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): sep_token=tokenizer.sep_token, sep_token_extra=bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token=tokenizer.encoder[tokenizer.pad_token] if args.model_type in ['roberta'] else tokenizer.vocab[tokenizer.pad_token], + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, ) if args.local_rank in [-1, 0]: diff --git a/examples/utils_glue.py b/examples/utils_glue.py index c955e4d0ce..e1649fa5af 100644 --- a/examples/utils_glue.py +++ b/examples/utils_glue.py @@ -425,9 +425,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] + # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
+ special_tokens_count = 3 if sep_token_extra else 2 + if len(tokens_a) > max_seq_length - special_tokens_count: + tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: From d8923270e6c497862f990a3c72e40cc1ddd01d4e Mon Sep 17 00:00:00 2001 From: Jason Phang Date: Fri, 16 Aug 2019 15:58:19 -0400 Subject: [PATCH 06/36] Correct truncation for RoBERTa in 2-input GLUE --- examples/utils_glue.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/utils_glue.py b/examples/utils_glue.py index e1649fa5af..3e3f104672 100644 --- a/examples/utils_glue.py +++ b/examples/utils_glue.py @@ -422,8 +422,9 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokens_b = tokenizer.tokenize(example.text_b) # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa. + special_tokens_count = 4 if sep_token_extra else 3 + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) else: # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. special_tokens_count = 3 if sep_token_extra else 2 From 189ff9b66408a1758f3732725db3871322f3e0e6 Mon Sep 17 00:00:00 2001 From: Christophe Bourguignat Date: Sat, 17 Aug 2019 18:46:50 +0200 Subject: [PATCH 07/36] Update README after RoBERTa addition --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3389e10593..7d2445fc11 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ import torch from pytorch_transformers import * # PyTorch-Transformers has a unified API -# for 6 transformer architectures and 27 pretrained weights. +# for 7 transformer architectures and 30 pretrained weights. 
# Model | Tokenizer | Pretrained weights shortcut MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'), (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'), From 00e9c4cc9616cab1666cab0a331b5d7e68946928 Mon Sep 17 00:00:00 2001 From: wangfei <1140554608@qq.com> Date: Sun, 18 Aug 2019 11:02:02 +0800 Subject: [PATCH 08/36] Fix: save model/model.module --- examples/lm_finetuning/finetune_on_pregenerated.py | 11 ++++++----- examples/lm_finetuning/simple_lm_finetuning.py | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 7c40342f18..1177d84cd4 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -155,12 +155,12 @@ def main(): help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") - parser.add_argument("--warmup_steps", - default=0, + parser.add_argument("--warmup_steps", + default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--adam_epsilon", - default=1e-8, + parser.add_argument("--adam_epsilon", + default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", @@ -322,7 +322,8 @@ def main(): # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") - model.save_pretrained(args.output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 25333de0ed..9633640faf 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py 
+++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -610,7 +610,8 @@ def main(): # Save a trained model if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") - model.save_pretrained(args.output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 1ef41b83374ce5756e24746201d21432d7ecada0 Mon Sep 17 00:00:00 2001 From: wangfei <1140554608@qq.com> Date: Sun, 18 Aug 2019 11:03:12 +0800 Subject: [PATCH 09/36] Revert "Fix: save model/model.module" This reverts commit 00e9c4cc9616cab1666cab0a331b5d7e68946928. --- examples/lm_finetuning/finetune_on_pregenerated.py | 11 +++++------ examples/lm_finetuning/simple_lm_finetuning.py | 3 +-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 1177d84cd4..7c40342f18 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -155,12 +155,12 @@ def main(): help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") - parser.add_argument("--warmup_steps", - default=0, + parser.add_argument("--warmup_steps", + default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--adam_epsilon", - default=1e-8, + parser.add_argument("--adam_epsilon", + default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", @@ -322,8 +322,7 @@ def main(): # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training - model_to_save.save_pretrained(args.output_dir) + model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 9633640faf..25333de0ed 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -610,8 +610,7 @@ def main(): # Save a trained model if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training - model_to_save.save_pretrained(args.output_dir) + model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 856a63da4d1f0f302633dc73e2d4a1f698bbafda Mon Sep 17 00:00:00 2001 From: wangfei <1140554608@qq.com> Date: Sun, 18 Aug 2019 11:03:47 +0800 Subject: [PATCH 10/36] Fix: save model/model.module --- examples/lm_finetuning/finetune_on_pregenerated.py | 3 ++- examples/lm_finetuning/simple_lm_finetuning.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 7c40342f18..eefa56c824 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -322,7 +322,8 @@ def main(): # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") - model.save_pretrained(args.output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 25333de0ed..9633640faf 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -610,7 +610,8 @@ def main(): # Save a trained model if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") - model.save_pretrained(args.output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 40acf6b52a5250608c2b90edd955835131971d5a Mon Sep 17 00:00:00 2001 From: Chi-Liang Liu Date: Tue, 30 Jul 2019 18:37:37 +0800 Subject: [PATCH 11/36] don't save model without training --- examples/run_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index f0ae9169ad..f2d29fd6b1 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -481,7 +481,7 @@ def main(): # Save the trained model and the tokenizer - if args.local_rank == -1 or torch.distributed.get_rank() == 0: + if args.do_train and (args.local_rank == -1 or
torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) From c589862b783b94a8408b40c6dc9bf4a14b2ee391 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 19 Aug 2019 10:17:47 -0400 Subject: [PATCH 12/36] Doc: loading from config alone does not load the model weights --- pytorch_transformers/modeling_bert.py | 4 +++- pytorch_transformers/modeling_gpt2.py | 2 ++ pytorch_transformers/modeling_openai.py | 2 ++ pytorch_transformers/modeling_roberta.py | 3 ++- pytorch_transformers/modeling_transfo_xl.py | 2 ++ pytorch_transformers/modeling_utils.py | 4 ++++ pytorch_transformers/modeling_xlm.py | 2 ++ pytorch_transformers/modeling_xlnet.py | 2 ++ 8 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 51d8788545..9c20eac9bf 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -577,7 +577,9 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in https://pytorch.org/docs/stable/nn.html#module Parameters: - config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
""" BERT_INPUTS_DOCSTRING = r""" diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 5211def3e3..f67d0e88d5 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -383,6 +383,8 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in Parameters: config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ GPT2_INPUTS_DOCSTRING = r""" Inputs: diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 364923b0af..e8648487be 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -397,6 +397,8 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in Parameters: config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py index adb04b4b3a..e3065cf60b 100644 --- a/pytorch_transformers/modeling_roberta.py +++ b/pytorch_transformers/modeling_roberta.py @@ -90,7 +90,8 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in Parameters: config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the - model. + model. Initializing with a config file does not load the weights associated with the model, only the configuration. 
+ Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ ROBERTA_INPUTS_DOCSTRING = r""" diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index cb5416964c..553a71fffe 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -928,6 +928,8 @@ TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in Parameters: config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ TRANSFO_XL_INPUTS_DOCSTRING = r""" diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 35f82e324f..edc6b3903e 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -71,6 +71,10 @@ class PretrainedConfig(object): r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. + Note: + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. + It only affects the model's configuration. + Class attributes (overridden by derived classes): - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values. 
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 941c8dda2f..d01d245bbb 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -416,6 +416,8 @@ XLM_START_DOCSTRING = r""" The XLM model was proposed in Parameters: config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ XLM_INPUTS_DOCSTRING = r""" diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index e9e75e3ab7..af33c5a6c2 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -647,6 +647,8 @@ XLNET_START_DOCSTRING = r""" The XLNet model was proposed in Parameters: config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. 
""" XLNET_INPUTS_DOCSTRING = r""" From a368b877911862da014ed7b219679effbb8dd8ca Mon Sep 17 00:00:00 2001 From: Peng Qi Date: Mon, 19 Aug 2019 13:07:00 -0700 Subject: [PATCH 13/36] Fix #1015 --- examples/run_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index f2d29fd6b1..efa835107c 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -498,7 +498,7 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) - tokenizer = tokenizer_class.from_pretrained(args.output_dir) + tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) From 28f7ca1f807f0857c24f18c0b28b6b8ebee18c0a Mon Sep 17 00:00:00 2001 From: Zeyao Du Date: Tue, 20 Aug 2019 15:58:42 +0800 Subject: [PATCH 14/36] swap optimizer.step and scheduler.step --- examples/lm_finetuning/simple_lm_finetuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index ba5f832827..dca883d2f6 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -602,8 +602,8 @@ def main(): nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: - scheduler.step() # Update learning rate schedule optimizer.step() + scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 From a1359b970cb4bfa41008a45b44dd2a25e579bff3 Mon Sep 17 00:00:00 2001 From: Zeyao Du Date: Tue, 20 Aug 2019 16:00:07 +0800 Subject: [PATCH 15/36] Update finetune_on_pregenerated.py --- examples/lm_finetuning/finetune_on_pregenerated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py 
index 9fcc5f2cb1..ccf1c15313 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -314,8 +314,8 @@ def main(): mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: - scheduler.step() # Update learning rate schedule optimizer.step() + scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 From 45ab8bf60e5c2af912006035f5568be92c0c99c9 Mon Sep 17 00:00:00 2001 From: Duzeyao <330501241@qq.com> Date: Tue, 20 Aug 2019 16:40:39 +0800 Subject: [PATCH 16/36] Revert "Update finetune_on_pregenerated.py" This reverts commit a1359b970cb4bfa41008a45b44dd2a25e579bff3. --- examples/lm_finetuning/finetune_on_pregenerated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index ccf1c15313..9fcc5f2cb1 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -314,8 +314,8 @@ def main(): mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: - optimizer.step() scheduler.step() # Update learning rate schedule + optimizer.step() optimizer.zero_grad() global_step += 1 From d86b49ac86141810af4a7c82ed34e789b3b1937e Mon Sep 17 00:00:00 2001 From: Duzeyao <330501241@qq.com> Date: Tue, 20 Aug 2019 16:46:34 +0800 Subject: [PATCH 17/36] swap optimizer.step and scheduler.step --- examples/lm_finetuning/finetune_on_pregenerated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 9fcc5f2cb1..ccf1c15313 100644 --- 
a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -314,8 +314,8 @@ def main(): mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: - scheduler.step() # Update learning rate schedule optimizer.step() + scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 From fecaed0ed4bf338bca5b9895107b309841f8ac57 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 20 Aug 2019 10:56:12 +0200 Subject: [PATCH 18/36] add force_download option to from_pretrained methods --- pytorch_transformers/file_utils.py | 13 ++++++++----- pytorch_transformers/modeling_utils.py | 13 +++++++++++-- pytorch_transformers/tokenization_utils.py | 6 +++++- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py index 75c075720c..074e6743ef 100644 --- a/pytorch_transformers/file_utils.py +++ b/pytorch_transformers/file_utils.py @@ -93,12 +93,15 @@ def filename_to_url(filename, cache_dir=None): return url, etag -def cached_path(url_or_filename, cache_dir=None): +def cached_path(url_or_filename, cache_dir=None, force_download=False): """ Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and then return the path. + Args: + cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). + force_download: if True, re-download the file even if it's already cached in the cache dir.
""" if cache_dir is None: cache_dir = PYTORCH_TRANSFORMERS_CACHE @@ -111,7 +114,7 @@ def cached_path(url_or_filename, cache_dir=None): if parsed.scheme in ('http', 'https', 's3'): # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, cache_dir) + return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download) elif os.path.exists(url_or_filename): # File, and it exists. return url_or_filename @@ -184,7 +187,7 @@ def http_get(url, temp_file): progress.close() -def get_from_cache(url, cache_dir=None): +def get_from_cache(url, cache_dir=None, force_download=False): """ Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file. @@ -227,11 +230,11 @@ def get_from_cache(url, cache_dir=None): if matching_files: cache_path = os.path.join(cache_dir, matching_files[-1]) - if not os.path.exists(cache_path): + if not os.path.exists(cache_path) or force_download: # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. with tempfile.NamedTemporaryFile() as temp_file: - logger.info("%s not found in cache, downloading to %s", url, temp_file.name) + logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) # GET file object if url.startswith("s3://"): diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index edc6b3903e..3e4fbca132 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -125,6 +125,9 @@ class PretrainedConfig(object): - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. 
+ force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exist. + return_unused_kwargs: (`optional`) bool: - If False, then this function returns just the final configuration object. @@ -146,6 +149,7 @@ class PretrainedConfig(object): """ cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) if pretrained_model_name_or_path in cls.pretrained_config_archive_map: @@ -156,7 +160,7 @@ class PretrainedConfig(object): config_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: - resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download) except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_config_archive_map: logger.error( @@ -400,6 +404,9 @@ class PreTrainedModel(nn.Module): Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exist. + output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
@@ -424,6 +431,7 @@ class PreTrainedModel(nn.Module): state_dict = kwargs.pop('state_dict', None) cache_dir = kwargs.pop('cache_dir', None) from_tf = kwargs.pop('from_tf', False) + force_download = kwargs.pop('force_download', False) output_loading_info = kwargs.pop('output_loading_info', False) # Load config @@ -431,6 +439,7 @@ class PreTrainedModel(nn.Module): config, model_kwargs = cls.config_class.from_pretrained( pretrained_model_name_or_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, + force_download=force_download, **kwargs ) else: @@ -453,7 +462,7 @@ class PreTrainedModel(nn.Module): archive_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download) except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_model_archive_map: logger.error( diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 74d50b385d..763c0cee04 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -193,6 +193,9 @@ class PreTrainedTokenizer(object): cache_dir: (`optional`) string: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. 
See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. @@ -223,6 +226,7 @@ class PreTrainedTokenizer(object): @classmethod def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} @@ -283,7 +287,7 @@ class PreTrainedTokenizer(object): if file_path is None: resolved_vocab_files[file_id] = None else: - resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir) + resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download) except EnvironmentError: if pretrained_model_name_or_path in s3_models: logger.error("Couldn't reach server to download vocabulary.") From e239a4a20fbb901e60ffcafc06bfefcbb67eaa65 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 20 Aug 2019 11:02:00 +0200 Subject: [PATCH 19/36] close #984 --- docs/source/pretrained_models.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 987882d12e..6a14e3dcd1 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -72,16 +72,16 @@ Here is the full list of the currently provided pretrained models together with | | ``xlnet-large-cased`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. 
| | | | | XLNet Large English model | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| XLM | ``xlm-mlm-en-2048`` | | 12-layer, 1024-hidden, 8-heads | +| XLM | ``xlm-mlm-en-2048`` | | 12-layer, 2048-hidden, 16-heads | | | | | XLM English model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``xlm-mlm-ende-1024`` | | 12-layer, 1024-hidden, 8-heads | +| | ``xlm-mlm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads | | | | | XLM English-German Multi-language model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``xlm-mlm-enfr-1024`` | | 12-layer, 1024-hidden, 8-heads | +| | ``xlm-mlm-enfr-1024`` | | 6-layer, 1024-hidden, 8-heads | | | | | XLM English-French Multi-language model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``xlm-mlm-enro-1024`` | | 12-layer, 1024-hidden, 8-heads | +| | ``xlm-mlm-enro-1024`` | | 6-layer, 1024-hidden, 8-heads | | | | | XLM English-Romanian Multi-language model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``xlm-mlm-xnli15-1024`` | | 12-layer, 1024-hidden, 8-heads | @@ -93,7 +93,7 @@ Here is the full list of the currently provided pretrained models together with | | ``xlm-clm-enfr-1024`` | | 12-layer, 1024-hidden, 8-heads | | | | | XLM 
English model trained with CLM (Causal Language Modeling) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``xlm-clm-ende-1024`` | | 12-layer, 1024-hidden, 8-heads | +| | ``xlm-clm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads | | | | | XLM English-German Multi-language model trained with CLM (Causal Language Modeling) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | RoBERTa | ``roberta-base`` | | 12-layer, 768-hidden, 12-heads, 125M parameters | From 901dde0e4583a00dc7e486aca6cda7acb647dea9 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 20 Aug 2019 11:05:51 +0200 Subject: [PATCH 20/36] fix #1014 --- pytorch_transformers/tokenization_bert.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index 177d26dec1..04f35aa466 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -187,6 +187,8 @@ class BertTokenizer(PreTrainedTokenizer): index = 0 if os.path.isdir(vocab_path): vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) + else: + vocab_file = vocab_path with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: From 53c8f700f4704a58f4684674ced1c57d6ca9240c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 20 Aug 2019 11:29:26 +0200 Subject: [PATCH 21/36] fix #808 --- pytorch_transformers/modeling_bert.py | 5 ++++- pytorch_transformers/modeling_gpt2.py | 2 ++ pytorch_transformers/modeling_openai.py | 2 ++ pytorch_transformers/modeling_roberta.py | 4 ++++ 
pytorch_transformers/modeling_transfo_xl.py | 2 ++ pytorch_transformers/modeling_xlm.py | 4 ++++ pytorch_transformers/modeling_xlnet.py | 2 ++ 7 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index 9c20eac9bf..7b34b3fd90 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -599,7 +599,10 @@ BERT_INPUTS_DOCSTRING = r""" ``tokens: [CLS] the dog is hairy . [SEP]`` ``token_type_ids: 0 0 0 0 0 0 0`` - + + Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index f67d0e88d5..91d01d0584 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -390,6 +390,8 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in GPT2_INPUTS_DOCSTRING = r""" Inputs: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. + GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index e8648487be..71ffb78e0f 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -404,6 +404,8 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. + GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py index e3065cf60b..e49b2a06b1 100644 --- a/pytorch_transformers/modeling_roberta.py +++ b/pytorch_transformers/modeling_roberta.py @@ -110,6 +110,10 @@ ROBERTA_INPUTS_DOCSTRING = r""" Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with the ``add_special_tokens`` parameter set to ``True``. + + RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py index 553a71fffe..3cfdee38cb 100644 --- a/pytorch_transformers/modeling_transfo_xl.py +++ b/pytorch_transformers/modeling_transfo_xl.py @@ -936,6 +936,8 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" Inputs: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. + Transformer-XL is a model with relative position embeddings so you can either pad the inputs on + the right or on the left. Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index d01d245bbb..be2767ed0c 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -424,6 +424,10 @@ XLM_INPUTS_DOCSTRING = r""" Inputs: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. + + XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index af33c5a6c2..d44821788e 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -655,6 +655,8 @@ XLNET_INPUTS_DOCSTRING = r""" Inputs: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. + XLNet is a model with relative position embeddings so you can either pad the inputs on + the right or on the left. Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`. See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. From b0b9b8091b73f929306704bd8cd62b712621cebc Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 20 Aug 2019 11:33:46 +0200 Subject: [PATCH 22/36] minor typo --- pytorch_transformers/modeling_gpt2.py | 2 +- pytorch_transformers/modeling_openai.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index f67d0e88d5..dd3e465bf3 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -614,7 +614,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): @add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, -the classification head takes as input the input of a specified classification token index in the intput sequence). +the classification head takes as input the input of a specified classification token index in the input sequence). 
""", GPT2_START_DOCSTRING) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): r""" Inputs: diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index e8648487be..a4f02111e7 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -604,7 +604,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): @add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, -the classification head takes as input the input of a specified classification token index in the intput sequence). +the classification head takes as input the input of a specified classification token index in the input sequence). """, OPENAI_GPT_START_DOCSTRING) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): r""" Inputs: From 6d0aa73981f15618cf8d01255b07194e946c3286 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 20 Aug 2019 12:20:21 +0200 Subject: [PATCH 23/36] fix #1034 --- pytorch_transformers/modeling_xlm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index be2767ed0c..19800da2ed 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -440,8 +440,10 @@ XLM_INPUTS_DOCSTRING = r""" Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: A parallel sequence of tokens to be used to indicate the language of each token in the input. - Indices are selected in the pre-trained language vocabulary, - i.e. in the range ``[0, config.n_langs - 1[``. 
+ Indices are languages ids which can be obtained from the language names by using two conversion mappings + provided in the configuration of the model (only provided for multilingual models). + More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and + the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: From bfd75056b0a080addafb7f3d7c9336d27b883a0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:06:17 +0200 Subject: [PATCH 24/36] Update tokenization_xlm.py --- pytorch_transformers/tokenization_xlm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index b690a3a945..8e7c2954f2 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -124,8 +124,9 @@ class XLMTokenizer(PreTrainedTokenizer): **kwargs) try: import ftfy - import spacy - self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat']) + from spacy.lang.en import English + _nlp = English() + self.nlp = nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") From bb04446285be43059050406b3bc4938807c63c25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:07:40 +0200 Subject: [PATCH 25/36] Update tokenization_openai.py --- pytorch_transformers/tokenization_openai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py index 0eb5281d39..0f6a8f1dae 100644 --- a/pytorch_transformers/tokenization_openai.py +++ b/pytorch_transformers/tokenization_openai.py @@ -89,9 +89,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): try: import ftfy - import spacy - self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat']) - self.fix_text = ftfy.fix_text + from spacy.lang.en import English + _nlp = English() + self.nlp = nlp.Defaults.create_tokenizer(_nlp) except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") self.nlp = BasicTokenizer(do_lower_case=True) From 562b998366c7a4a2bd0addf1a860fbee0aa04d74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:10:19 +0200 Subject: [PATCH 26/36] Update tokenization_openai.py --- pytorch_transformers/tokenization_openai.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py index 0f6a8f1dae..79eb023a8d 100644 --- a/pytorch_transformers/tokenization_openai.py +++ b/pytorch_transformers/tokenization_openai.py @@ -92,6 +92,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): from spacy.lang.en import English _nlp = English() self.nlp = nlp.Defaults.create_tokenizer(_nlp) + self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") self.nlp = BasicTokenizer(do_lower_case=True) From f5e2ed0fd89d5730126d71c03324fa07ae674ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:19:25 +0200 Subject: [PATCH 27/36] Update tokenization_openai.py --- pytorch_transformers/tokenization_openai.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py index 79eb023a8d..51b418ebd3 100644 --- a/pytorch_transformers/tokenization_openai.py +++ b/pytorch_transformers/tokenization_openai.py @@ -91,7 +91,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): import ftfy from spacy.lang.en import English _nlp = English() - self.nlp = nlp.Defaults.create_tokenizer(_nlp) + self.nlp = _nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") From 388e3251fa95b892949968dc89065e464a93b69f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:19:39 +0200 Subject: [PATCH 28/36] Update tokenization_xlm.py --- pytorch_transformers/tokenization_xlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 8e7c2954f2..2d2f3a8cd4 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -126,7 +126,7 @@ class XLMTokenizer(PreTrainedTokenizer): import ftfy from spacy.lang.en import English _nlp = English() - self.nlp = nlp.Defaults.create_tokenizer(_nlp) + self.nlp = _nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") From ad6e62cd827d546691845aca5fb9b437c5812d6a Mon Sep 17 00:00:00 2001 From: Nikolay Korolev Date: Tue, 20 Aug 2019 15:43:06 +0300 Subject: [PATCH 29/36] Fix typo. 
configuratoin -> configuration --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7d2445fc11..4e57de5842 100644 --- a/README.md +++ b/README.md @@ -328,7 +328,7 @@ Breaking change in the `from_pretrained()`method: 1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules. -2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuratoin class attributes. +2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes. 
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before. From 43489756ad421a99d0f3eb9d83116b9b4904c922 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 20 Aug 2019 16:59:11 +0200 Subject: [PATCH 30/36] adding proxies options for the from_pretrained methods --- .gitignore | 4 ++- pytorch_transformers/file_utils.py | 29 +++++++++++----------- pytorch_transformers/modeling_utils.py | 14 +++++++++-- pytorch_transformers/tokenization_utils.py | 7 +++++- 4 files changed, 36 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 6bbe32df6c..466a167552 100644 --- a/.gitignore +++ b/.gitignore @@ -127,4 +127,6 @@ proc_data # examples runs -examples/runs \ No newline at end of file +examples/runs + +data \ No newline at end of file diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py index 074e6743ef..f6f2151b12 100644 --- a/pytorch_transformers/file_utils.py +++ b/pytorch_transformers/file_utils.py @@ -17,8 +17,9 @@ from hashlib import sha256 from io import open import boto3 -import requests +from botocore.config import Config from botocore.exceptions import ClientError +import requests from tqdm import tqdm try: @@ -93,7 +94,7 @@ def filename_to_url(filename, cache_dir=None): return url, etag -def cached_path(url_or_filename, cache_dir=None, force_download=False): +def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None): """ Given something that might be a URL (or might be a local path), determine which. 
If it's a URL, download the file and cache it, and @@ -114,7 +115,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False): if parsed.scheme in ('http', 'https', 's3'): # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download) + return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies) elif os.path.exists(url_or_filename): # File, and it exists. return url_or_filename @@ -159,24 +160,24 @@ def s3_request(func): @s3_request -def s3_etag(url): +def s3_etag(url, proxies=None): """Check ETag on S3 object.""" - s3_resource = boto3.resource("s3") + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) bucket_name, s3_path = split_s3_path(url) s3_object = s3_resource.Object(bucket_name, s3_path) return s3_object.e_tag @s3_request -def s3_get(url, temp_file): +def s3_get(url, temp_file, proxies=None): """Pull a file directly from S3.""" - s3_resource = boto3.resource("s3") + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) bucket_name, s3_path = split_s3_path(url) s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) -def http_get(url, temp_file): - req = requests.get(url, stream=True) +def http_get(url, temp_file, proxies=None): + req = requests.get(url, stream=True, proxies=proxies) content_length = req.headers.get('Content-Length') total = int(content_length) if content_length is not None else None progress = tqdm(unit="B", total=total) @@ -187,7 +188,7 @@ def http_get(url, temp_file): progress.close() -def get_from_cache(url, cache_dir=None, force_download=False): +def get_from_cache(url, cache_dir=None, force_download=False, proxies=None): """ Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file. 
@@ -204,10 +205,10 @@ def get_from_cache(url, cache_dir=None, force_download=False): # Get eTag to add to filename, if it exists. if url.startswith("s3://"): - etag = s3_etag(url) + etag = s3_etag(url, proxies=proxies) else: try: - response = requests.head(url, allow_redirects=True) + response = requests.head(url, allow_redirects=True, proxies=proxies) if response.status_code != 200: etag = None else: @@ -238,9 +239,9 @@ def get_from_cache(url, cache_dir=None, force_download=False): # GET file object if url.startswith("s3://"): - s3_get(url, temp_file) + s3_get(url, temp_file, proxies=proxies) else: - http_get(url, temp_file) + http_get(url, temp_file, proxies=proxies) # we are copying the file before closing it, so flush to avoid truncation temp_file.flush() diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 3e4fbca132..f1501aa8d5 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -128,6 +128,10 @@ class PretrainedConfig(object): force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + return_unused_kwargs: (`optional`) bool: - If False, then this function returns just the final configuration object. 
@@ -150,6 +154,7 @@ class PretrainedConfig(object): """ cache_dir = kwargs.pop('cache_dir', None) force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) if pretrained_model_name_or_path in cls.pretrained_config_archive_map: @@ -160,7 +165,7 @@ class PretrainedConfig(object): config_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: - resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_config_archive_map: logger.error( @@ -407,6 +412,10 @@ class PreTrainedModel(nn.Module): force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. 
@@ -432,6 +441,7 @@ class PreTrainedModel(nn.Module): cache_dir = kwargs.pop('cache_dir', None) from_tf = kwargs.pop('from_tf', False) force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) output_loading_info = kwargs.pop('output_loading_info', False) # Load config @@ -462,7 +472,7 @@ class PreTrainedModel(nn.Module): archive_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download) + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_model_archive_map: logger.error( diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 763c0cee04..68af97a518 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -196,6 +196,10 @@ class PreTrainedTokenizer(object): force_download: (`optional`) boolean, default False: Force to (re-)download the vocabulary files and override the cached versions if they exists. + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. 
@@ -227,6 +231,7 @@ class PreTrainedTokenizer(object): def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): cache_dir = kwargs.pop('cache_dir', None) force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} @@ -287,7 +292,7 @@ class PreTrainedTokenizer(object): if file_path is None: resolved_vocab_files[file_id] = None else: - resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download) + resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies) except EnvironmentError: if pretrained_model_name_or_path in s3_models: logger.error("Couldn't reach server to download vocabulary.") From 3bffd2e8e5d726d581e0a66746b25c64d49e231d Mon Sep 17 00:00:00 2001 From: Peng Qi Date: Tue, 20 Aug 2019 10:59:28 -0700 Subject: [PATCH 31/36] more fixes --- examples/run_glue.py | 2 +- examples/run_squad.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index 7fb0732e61..1729f4f7e3 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -467,7 +467,7 @@ def main(): # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) - tokenizer = tokenizer_class.from_pretrained(args.output_dir) + tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) diff --git a/examples/run_squad.py b/examples/run_squad.py index efa835107c..c0586b03bd 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -481,7 +481,7 @@ def main(): # Save the trained model and the tokenizer - if args.do_train and args.local_rank == -1 or torch.distributed.get_rank() == 0: + if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed 
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) From aa05dc8935a3e5b349abecbdc5399796578fe965 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 21 Aug 2019 02:29:34 +0200 Subject: [PATCH 32/36] adding gpt-2 large --- pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py | 2 +- .../convert_openai_checkpoint_to_pytorch.py | 2 +- .../convert_transfo_xl_checkpoint_to_pytorch.py | 2 +- pytorch_transformers/modeling_gpt2.py | 6 ++++-- pytorch_transformers/tokenization_gpt2.py | 2 ++ 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py index f9e83f5d5b..e9bfa0302a 100755 --- a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py @@ -35,7 +35,7 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p if gpt2_config_file == "": config = GPT2Config() else: - config = GPT2Config(gpt2_config_file) + config = GPT2Config.from_json_file(gpt2_config_file) model = GPT2Model(config) # Load weights from numpy diff --git a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py index 70895b4002..3009f8a99e 100755 --- a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py @@ -35,7 +35,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c if openai_config_file == "": config = OpenAIGPTConfig() else: - config = OpenAIGPTConfig(openai_config_file) + config = OpenAIGPTConfig.from_json_file(openai_config_file) model = OpenAIGPTModel(config) # Load weights from numpy diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py index 5733146444..7e79d58d7d 
100755 --- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py +++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py @@ -75,7 +75,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, if transfo_xl_config_file == "": config = TransfoXLConfig() else: - config = TransfoXLConfig(transfo_xl_config_file) + config = TransfoXLConfig.from_json_file(transfo_xl_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = TransfoXLLMHeadModel(config) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index cb4b8cc4ab..9022048d6d 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -38,9 +38,11 @@ from .modeling_bert import BertLayerNorm as LayerNorm logger = logging.getLogger(__name__) GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"} + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"} GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"} + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index 0aee856180..4016a85a7f 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ 
b/pytorch_transformers/tokenization_gpt2.py @@ -45,11 +45,13 @@ PRETRAINED_VOCAB_FILES_MAP = { { 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", + 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", }, 'merges_file': { 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", + 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", }, } From fdc487d8b33dcb8b2ddebd7a1fe4bd0eee4e2a40 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 21 Aug 2019 02:35:01 +0200 Subject: [PATCH 33/36] Add max length --- pytorch_transformers/tokenization_gpt2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index 4016a85a7f..e67f25ff59 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -58,6 +58,7 @@ PRETRAINED_VOCAB_FILES_MAP = { PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'gpt2': 1024, 'gpt2-medium': 1024, + 'gpt2-large': 1024, } @lru_cache() From 6f877d9daf36788bad4fd228930939fed6ab12bd Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Wed, 21 Aug 2019 03:43:29 +0000 Subject: [PATCH 34/36] Update dev results on GLUE (bert-base-uncased) w/ median on 5 runs --- docs/source/examples.rst | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 51c8d850b9..7777117b47 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -68,7 +68,9 @@ GLUE results on dev set ~~~~~~~~~~~~~~~~~~~~~~~ We get the following results on the dev set of GLUE benchmark with an uncased BERT base -model. 
All experiments were run on a P100 GPU with a batch size of 32. +model (`bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train batch size of 24. Some of +these tasks have a small dataset and training can lead to high variance in the results between different runs. +We report the median on 5 runs (with different seeds) for each of the metrics. .. list-table:: :header-rows: 1 @@ -78,31 +80,31 @@ model. All experiments were run on a P100 GPU with a batch size of 32. - Result * - CoLA - Matthew's corr. - - 57.29 + - 55.75 * - SST-2 - accuracy - - 93.00 + - 92.09 * - MRPC - F1/accuracy - - 88.85/83.82 + - 90.48/86.27 * - STS-B - Pearson/Spearman corr. - - 89.70/89.37 + - 89.03/88.64 * - QQP - accuracy/F1 - - 90.72/87.41 + - 90.92/87.72 * - MNLI - matched acc./mismatched acc. - - 83.95/84.39 + - 83.74/84.06 * - QNLI - accuracy - - 89.04 + - 91.07 * - RTE - accuracy - - 61.01 + - 68.59 * - WNLI - accuracy - - 53.52 + - 43.66 Some of these results are significantly different from the ones reported on the test set From 2f9397139d1be373efa76b8133d71e1bdc43bbb3 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 11:29:37 -0400 Subject: [PATCH 35/36] Added GPT-2 LARGE to Pre-trained Models documentation --- docs/source/pretrained_models.rst | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 6a14e3dcd1..7df70ea225 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -62,6 +62,9 @@ Here is the full list of the currently provided pretrained models together with | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``gpt2-medium`` | | 24-layer, 1024-hidden, 16-heads, 345M parameters. 
| | | | | OpenAI's Medium-sized GPT-2 English model | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``gpt2-large`` | | 36-layer, 1280-hidden, 20-heads, 774M parameters. | +| | | | OpenAI's Large-sized GPT-2 English model | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | Transformer-XL | ``transfo-xl-wt103`` | | 18-layer, 1024-hidden, 16-heads, 257M parameters. | | | | | English model trained on wikitext-103 | @@ -72,16 +75,16 @@ Here is the full list of the currently provided pretrained models together with | | ``xlnet-large-cased`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. | | | | | XLNet Large English model | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| XLM | ``xlm-mlm-en-2048`` | | 12-layer, 2048-hidden, 16-heads | +| XLM | ``xlm-mlm-en-2048`` | | 12-layer, 2048-hidden, 16-heads | | | | | XLM English model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``xlm-mlm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads | +| | ``xlm-mlm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads | | | | | XLM English-German Multi-language model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``xlm-mlm-enfr-1024`` | | 6-layer, 1024-hidden, 8-heads | +| | 
``xlm-mlm-enfr-1024`` | | 6-layer, 1024-hidden, 8-heads | | | | | XLM English-French Multi-language model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``xlm-mlm-enro-1024`` | | 6-layer, 1024-hidden, 8-heads | +| | ``xlm-mlm-enro-1024`` | | 6-layer, 1024-hidden, 8-heads | | | | | XLM English-Romanian Multi-language model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``xlm-mlm-xnli15-1024`` | | 12-layer, 1024-hidden, 8-heads | @@ -93,7 +96,7 @@ Here is the full list of the currently provided pretrained models together with | | ``xlm-clm-enfr-1024`` | | 12-layer, 1024-hidden, 8-heads | | | | | XLM English model trained with CLM (Causal Language Modeling) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``xlm-clm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads | +| | ``xlm-clm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads | | | | | XLM English-German Multi-language model trained with CLM (Causal Language Modeling) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | RoBERTa | ``roberta-base`` | | 12-layer, 768-hidden, 12-heads, 125M parameters | From e00b4ff1de0591d5093407b16e665e5c86028f04 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 21 Aug 2019 22:22:17 +0200 Subject: [PATCH 36/36] fix #1017 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e57de5842..9751c720b8 
100644 --- a/README.md +++ b/README.md @@ -393,8 +393,8 @@ for batch in train_data: loss = model(batch) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue) - scheduler.step() optimizer.step() + scheduler.step() optimizer.zero_grad() ```