From 117ed92992a8b7ec45b399a2b5e2f9b66358a7d4 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 16 Jul 2019 11:58:47 -0400 Subject: [PATCH 01/23] RestructuredText table for pretrained models. --- docs/source/pretrained_models.rst | 147 +++++++++++++++++++----------- 1 file changed, 94 insertions(+), 53 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 2d72977951..e4ad7a6eaa 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -3,57 +3,98 @@ Pretrained models Here is the full list of the currently provided pretrained models together with a short presentation of each model. -+===============+============================================================+===========================+ -| Architecture | Shortcut name | Details of the model | -+===============+============================================================+===========================+ -| | ``bert-base-uncased`` | 12-layer, 768-hidden, 12-heads, 110M parameters -| | | Trained on lower-cased English text | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-large-uncased`` | 24-layer, 1024-hidden, 16-heads, 340M parameters -| | | Trained on lower-cased English text | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-base-cased`` | 12-layer, 768-hidden, 12-heads, 110M parameters -| | | Trained on cased English text | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-large-cased`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | -| | | Trained on cased English text | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-base-multilingual-uncased`` | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters -| | | Trained on lower-cased text in the top 102 languages with the largest 
Wikipedias -| | | (see `details `_) | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-base-multilingual-cased`` | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters | -| | | Trained on cased text in the top 104 languages with the largest Wikipedias -| | | (see `details `_) | -| +------------------------------------------------------------+---------------------------+ -| BERT | ``bert-base-chinese`` | 12-layer, 768-hidden, 12-heads, 110M parameters | -| | | Trained on cased Chinese Simplified and Traditional text | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-base-german-cased`` | 12-layer, 768-hidden, 12-heads, 110M parameters | -| | | Trained on cased German text by Deepset.ai | -| | | (see `details on deepset.ai website `_) | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-large-uncased-whole-word-masking`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | -| | | Trained on lower-cased English text using Whole-Word-Masking | -| | | (see `details `_) | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-large-cased-whole-word-masking`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | -| | | Trained on cased English text using Whole-Word-Masking | -| | | (see `details `_) | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | -| | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD | -| | | (see details of fine-tuning in the `example section`_) | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-large-cased-whole-word-masking-finetuned-squad`` | 24-layer, 1024-hidden, 16-heads, 
340M parameters | -| | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD | -| | | (see `details of fine-tuning in the example section `_) | -| +------------------------------------------------------------+---------------------------+ -| | ``bert-base-cased-finetuned-mrpc`` | 12-layer, 768-hidden, 12-heads, 110M parameters | -| | | The ``bert-base-cased`` model fine-tuned on MRPC | -| | | (see `details of fine-tuning in the example section `_) | -+---------------+------------------------------------------------------------+---------------------------+ -| GPT | Cells may span columns. | -+---------------+----------------------------------------------------------------------------------------+ -.. `_ \ No newline at end of file ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| Architecture | Shortcut name | Details of the model | ++===================+============================================================+===========================================================================================================================+ +| BERT | ``bert-base-uncased`` | 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | Trained on lower-cased English text | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-large-uncased`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | +| | | Trained on lower-cased English text | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-cased`` | 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | Trained on cased English text | +| 
+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-large-cased`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | +| | | Trained on cased English text | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-multilingual-uncased`` | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | Trained on lower-cased text in the top 102 languages with the largest Wikipedias | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-multilingual-cased`` | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | Trained on cased text in the top 104 languages with the largest Wikipedias | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-chinese`` | 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | Trained on cased Chinese Simplified and Traditional text | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-german-cased`` | 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | Trained on cased German text by Deepset.ai | +| | | (see `details on deepset.ai website `__) | +| 
+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-large-uncased-whole-word-masking`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | +| | | Trained on lower-cased English text using Whole-Word-Masking | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-large-cased-whole-word-masking`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | +| | | Trained on cased English text using Whole-Word-Masking | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | +| | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD | +| | | (see details of fine-tuning in the `example section`__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-large-cased-whole-word-masking-finetuned-squad`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | +| | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD | +| | | (see `details of fine-tuning in the example section `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-cased-finetuned-mrpc`` | 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | The ``bert-base-cased`` model fine-tuned on 
MRPC | +| | | (see `details of fine-tuning in the example section `__) | ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| GPT | ``openai-gpt`` | 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | OpenAI GPT English model | ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| GPT-2 | ``gpt2`` | 12-layer, 768-hidden, 12-heads, 117M parameters | +| | | OpenAI GPT-2 English model | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``gpt2-medium`` | 24-layer, 1024-hidden, 16-heads, 345M parameters | +| | | OpenAI's Medium-sized GPT-2 English model | ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| Transformer-XL | ``transfo-xl-wt103`` | 18-layer, 1024-hidden, 16-heads, 257M parameters | +| | | English model trained on wikitext-103 | ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| XLNet | ``xlnet-base-cased`` | 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | XLNet English model | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``xlnet-large-cased`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | +| | | XLNet Large English model | 
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| XLM | ``xlm-mlm-en-2048`` | 12-layer, 1024-hidden, 8-heads | +| | | XLM English model | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``xlm-mlm-ende-1024`` | 12-layer, 1024-hidden, 8-heads | +| | | XLM English-German Multi-language model | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``xlm-mlm-enfr-1024`` | 12-layer, 1024-hidden, 8-heads | +| | | XLM English-French Multi-language model | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``xlm-mlm-enro-1024`` | 12-layer, 1024-hidden, 8-heads | +| | | XLM English-Romanian Multi-language model | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``xlm-mlm-xnli15-1024`` | 12-layer, 1024-hidden, 8-heads | +| | | XLM Model pre-trained with MLM on the `15 XNLI languages`__. | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``xlm-mlm-tlm-xnli15-1024`` | 12-layer, 1024-hidden, 8-heads | +| | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages`__. 
| +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``xlm-clm-enfr-1024`` | 12-layer, 1024-hidden, 8-heads | +| | | XLM English model trained with CLM (Causal Language Modeling) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ +| | ``xlm-clm-ende-1024`` | 12-layer, 1024-hidden, 8-heads | +| | | XLM English-German Multi-language model trained with CLM (Causal Language Modeling) | ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ + +.. `__ \ No newline at end of file From 9d381e7be9fb97e09777fa66aa3e336ca132af70 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 17 Jul 2019 09:25:38 -0400 Subject: [PATCH 02/23] Fixed incorrect links in the PretrainedModel --- docs/source/pretrained_models.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index e4ad7a6eaa..b23a96ff7c 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -43,8 +43,8 @@ Here is the full list of the currently provided pretrained models together with | | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ | | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | -| | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD | -| | | (see details of fine-tuning in the `example section`__) | +| | | The 
``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD (see details of fine-tuning in the | +| | | `example section `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ | | ``bert-large-cased-whole-word-masking-finetuned-squad`` | 24-layer, 1024-hidden, 16-heads, 340M parameters | | | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD | @@ -85,10 +85,10 @@ Here is the full list of the currently provided pretrained models together with | | | XLM English-Romanian Multi-language model | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ | | ``xlm-mlm-xnli15-1024`` | 12-layer, 1024-hidden, 8-heads | -| | | XLM Model pre-trained with MLM on the `15 XNLI languages`__. | +| | | XLM Model pre-trained with MLM on the `15 XNLI languages `__. | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ | | ``xlm-mlm-tlm-xnli15-1024`` | 12-layer, 1024-hidden, 8-heads | -| | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages`__. | +| | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages `__. 
| | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+ | | ``xlm-clm-enfr-1024`` | 12-layer, 1024-hidden, 8-heads | | | | XLM English model trained with CLM (Causal Language Modeling) | From 1383c7b87af19bf21adf19d66cf6ee1a80555ea4 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 23 Jul 2019 17:52:20 +0200 Subject: [PATCH 03/23] Fix #869 --- pytorch_transformers/modeling_utils.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 324cdc17c9..3f1df0a49d 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -39,6 +39,20 @@ WEIGHTS_NAME = "pytorch_model.bin" TF_WEIGHTS_NAME = 'model.ckpt' +try: + from torch.nn import Identity +except ImportError: + # Older PyTorch compatibility + class Identity(nn.Module): + r"""A placeholder identity operator that is argument-insensitive. 
+ """ + def __init__(self, *args, **kwargs): + super(Identity, self).__init__() + + def forward(self, input): + return input + + if not six.PY2: def add_start_docstrings(*docstr): def docstring_decorator(fn): @@ -731,7 +745,7 @@ class SequenceSummary(nn.Module): # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError - self.summary = nn.Identity() + self.summary = Identity() if hasattr(config, 'summary_use_proj') and config.summary_use_proj: if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels @@ -739,15 +753,15 @@ class SequenceSummary(nn.Module): num_classes = config.hidden_size self.summary = nn.Linear(config.hidden_size, num_classes) - self.activation = nn.Identity() + self.activation = Identity() if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh': self.activation = nn.Tanh() - self.first_dropout = nn.Identity() + self.first_dropout = Identity() if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0: self.first_dropout = nn.Dropout(config.summary_first_dropout) - self.last_dropout = nn.Identity() + self.last_dropout = Identity() if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0: self.last_dropout = nn.Dropout(config.summary_last_dropout) From a7fce6d9176cf3662d153af54270f345eb0bec8d Mon Sep 17 00:00:00 2001 From: Chi-Liang Liu Date: Wed, 24 Jul 2019 16:11:36 +0800 Subject: [PATCH 04/23] fix squad v1 error (na_prob_file should be None) --- examples/run_squad.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 36e03fb012..df8e3b4a82 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -241,7 +241,10 @@ def evaluate(args, model, tokenizer, prefix=""): # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) 
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) - output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) + if args.version_2_with_negative: + output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) + else: + output_null_log_odds_file = None if args.model_type in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure From 66b15f73f0caeadadf1c65c6e047ebb4285f1f7a Mon Sep 17 00:00:00 2001 From: rococo // Ron Date: Wed, 24 Jul 2019 11:27:08 -0700 Subject: [PATCH 05/23] Update docs for parameter rename OpenAIGPTLMHeadModel now accepts `labels` instead of `lm_labels` --- pytorch_transformers/modeling_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 4ea19a965d..17a46fa470 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -538,7 +538,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for language modeling. - Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-1`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` From ae152cec09b496101841dcbc59613cc7a3d133a4 Mon Sep 17 00:00:00 2001 From: Joel Grus Date: Wed, 24 Jul 2019 16:54:48 -0700 Subject: [PATCH 06/23] make save_pretrained work with added tokens right now it's dumping the *decoder* when it should be dumping the *encoder*. this fixes that. 
--- pytorch_transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index f603a29d74..858edc7c50 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -266,7 +266,7 @@ class PreTrainedTokenizer(object): with open(added_tokens_file, 'w', encoding='utf-8') as f: if self.added_tokens_encoder: - out_str = json.dumps(self.added_tokens_decoder, ensure_ascii=False) + out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) else: out_str = u"{}" f.write(out_str) From adb3ef636877586ab64ea9be97f3407433d053d8 Mon Sep 17 00:00:00 2001 From: zijunsun Date: Thu, 25 Jul 2019 13:09:10 +0800 Subject: [PATCH 07/23] multi-gpu training also should be after apex fp16 --- examples/run_glue.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index 5d9abd06fc..0d4ffaa390 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -92,6 +92,10 @@ def train(args, train_dataset, model, tokenizer): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) + # multi-gpu training (should be after apex fp16 initialization) + if args.n_gpu > 1: + model = torch.nn.DataParallel(model) + # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], @@ -418,8 +422,6 @@ def main(): torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) logger.info("Training/evaluation parameters %s", args) From 35c52f2f3cf85e26a85a7c52cff789983edaa62c Mon Sep 17 00:00:00 
2001 From: Sukuya Date: Thu, 25 Jul 2019 16:51:11 +0800 Subject: [PATCH 08/23] Update torchscript.rst Import fixed to pytorch_transformers else torchscript flag can't be used. --- docs/source/torchscript.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst index 1b84559567..e207068fad 100644 --- a/docs/source/torchscript.rst +++ b/docs/source/torchscript.rst @@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename `` .. code-block:: python - from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig + from pytorch_transformers import BertModel, BertTokenizer, BertConfig import torch enc = BertTokenizer.from_pretrained("bert-base-uncased") @@ -129,4 +129,4 @@ Using the traced model for inference is as simple as using its ``__call__`` dund .. code-block:: python - traced_model(tokens_tensor, segments_tensors) \ No newline at end of file + traced_model(tokens_tensor, segments_tensors) From f0aeb7a814289a64a5b22577415a0cfcde3c7870 Mon Sep 17 00:00:00 2001 From: zijunsun Date: Fri, 26 Jul 2019 15:23:29 +0800 Subject: [PATCH 09/23] =?UTF-8?q?multi-gpu=20training=20also=20should=20be?= =?UTF-8?q?=20after=20apex=20fp16=EF=BC=88squad=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/run_squad.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 36e03fb012..692cb4a20c 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -101,6 +101,10 @@ def train(args, train_dataset, model, tokenizer): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) + # multi-gpu training (should be after apex fp16 initialization) + if args.n_gpu > 1: + model = torch.nn.DataParallel(model) + # 
Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], @@ -457,8 +461,6 @@ def main(): torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) logger.info("Training/evaluation parameters %s", args) From edfd965ac8a5446adb2c94ad043263b3144b3f95 Mon Sep 17 00:00:00 2001 From: David Pollack Date: Fri, 26 Jul 2019 14:13:46 +0200 Subject: [PATCH 10/23] fix convert_to_tf --- .../convert_pytorch_checkpoint_to_tf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py index b8858ee3dc..a2e7b5c41a 100644 --- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py @@ -72,11 +72,11 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s return 'bert/{}'.format(name) def assign_tf_var(tensor:np.ndarray, name:str): - tmp_var = tf.Variable(initial_value=tensor) - tf_var = tf.get_variable(dtype=tmp_var.dtype, shape=tmp_var.shape, name=name) - op = tf.assign(ref=tf_var, value=tmp_var) - session.run(tf.variables_initializer([tmp_var, tf_var])) - session.run(fetches=[op, tf_var]) + tf_dtype = tf.dtypes.as_dtype(tensor.dtype) + tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name) + session.run(tf.variables_initializer([tf_var])) + tf.keras.backend.set_value(tf_var, tensor) + session.run(tf_var) return tf_var for var_name in state_dict: From 09ecf225e9ac00f78ecf9246957128f5d7d79a52 Mon Sep 17 00:00:00 2001 From: David Pollack Date: Fri, 26 Jul 2019 15:20:44 +0200 Subject: [PATCH 11/23] fixed the fix. tf session madness. 
--- .../convert_pytorch_checkpoint_to_tf.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py index a2e7b5c41a..c24dddc4d6 100644 --- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py @@ -62,34 +62,34 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) - session = tf.Session() state_dict = model.state_dict() - tf_vars = [] def to_tf_var_name(name:str): for patt, repl in iter(var_map): name = name.replace(patt, repl) return 'bert/{}'.format(name) - def assign_tf_var(tensor:np.ndarray, name:str): + def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): tf_dtype = tf.dtypes.as_dtype(tensor.dtype) - tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name) + tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) session.run(tf.variables_initializer([tf_var])) - tf.keras.backend.set_value(tf_var, tensor) session.run(tf_var) return tf_var - for var_name in state_dict: - tf_name = to_tf_var_name(var_name) - torch_tensor = state_dict[var_name].numpy() - if any([x in var_name for x in tensors_to_transopse]): - torch_tensor = torch_tensor.T - tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name) - tf_vars.append(tf_tensor) - print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name)))) + tf.reset_default_graph() + with tf.Session() as session: + for var_name in state_dict: + tf_name = to_tf_var_name(var_name) + torch_tensor = state_dict[var_name].numpy() + if any([x in var_name for x in tensors_to_transopse]): + torch_tensor = torch_tensor.T + tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) + tf.keras.backend.set_value(tf_var, torch_tensor) + 
tf_weight = session.run(tf_var) + print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) - saver = tf.train.Saver(tf_vars) - saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) + saver = tf.train.Saver(tf.trainable_variables()) + saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) def main(raw_args=None): From 632d711411d2126e90cd4657f411a09bc180f561 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 26 Jul 2019 21:14:37 +0200 Subject: [PATCH 12/23] fix #908 --- pytorch_transformers/__init__.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index f875e4ab18..b4b957192c 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -7,20 +7,20 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_xlm import XLMTokenizer from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization) -from .modeling_bert import (BertConfig, BertModel, BertForPreTraining, - BertForMaskedLM, BertForNextSentencePrediction, - BertForSequenceClassification, BertForMultipleChoice, - BertForTokenClassification, BertForQuestionAnswering, - load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - BERT_PRETRAINED_CONFIG_ARCHIVE_MAP) -from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel, +from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, + BertForMaskedLM, BertForNextSentencePrediction, + BertForSequenceClassification, BertForMultipleChoice, + BertForTokenClassification, BertForQuestionAnswering, + load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP) +from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, load_tf_weights_in_openai_gpt, 
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, +from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_gpt2 import (GPT2Config, GPT2Model, +from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) @@ -29,7 +29,7 @@ from .modeling_xlnet import (XLNetConfig, XLNetForSequenceClassification, XLNetForQuestionAnswering, load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) -from .modeling_xlm import (XLMConfig, XLMModel, +from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) From 7b6e474c9acc26962363e78ef95fdb6f006eb0b4 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 26 Jul 2019 21:26:44 +0200 Subject: [PATCH 13/23] fix #901 --- pytorch_transformers/tokenization_utils.py | 28 ++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 858edc7c50..e2fe46320e 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -160,26 +160,46 @@ class PreTrainedTokenizer(object): s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} if pretrained_model_name_or_path in s3_models: + # Get the vocabulary from AWS S3 bucket for file_id, map_list in cls.pretrained_vocab_files_map.items(): vocab_files[file_id] = 
map_list[pretrained_model_name_or_path] else: + # Get the vocabulary from local files logger.info( "Model name '{}' not found in model shortcut name list ({}). " "Assuming '{}' is a path or url to a directory containing tokenizer files.".format( pretrained_model_name_or_path, ', '.join(s3_models), pretrained_model_name_or_path)) - all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, - 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE} - all_vocab_files_names.update(cls.vocab_files_names) - for file_id, file_name in all_vocab_files_names.items(): + + # Look for the tokenizer main vocabulary files + for file_id, file_name in cls.vocab_files_names.items(): if os.path.isdir(pretrained_model_name_or_path): + # If a directory is provided we look for the standard filenames full_file_name = os.path.join(pretrained_model_name_or_path, file_name) else: + # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file) full_file_name = pretrained_model_name_or_path if not os.path.exists(full_file_name): logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) full_file_name = None vocab_files[file_id] = full_file_name + + # Look for the additional tokens files + all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, + 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE} + + # If a path to a file was provided, get the parent directory + saved_directory = pretrained_model_name_or_path + if os.path.exists(saved_directory) and not os.path.isdir(saved_directory): + saved_directory = os.path.dirname(saved_directory) + + for file_id, file_name in all_vocab_files_names.items(): + full_file_name = os.path.join(saved_directory, file_name) + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. 
We won't load it.".format(full_file_name)) + full_file_name = None + vocab_files[file_id] = full_file_name + if all(full_file_name is None for full_file_name in vocab_files.values()): logger.error( "Model name '{}' was not found in model name list ({}). " From c90119e5430954abc9e852dd334d90d3ca906eb1 Mon Sep 17 00:00:00 2001 From: David Pollack Date: Mon, 29 Jul 2019 16:56:02 +0200 Subject: [PATCH 14/23] spelling mistake --- pytorch_transformers/convert_pytorch_checkpoint_to_tf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py index c24dddc4d6..025c2f396c 100644 --- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py @@ -41,7 +41,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s N BertForQuestionAnswering """ - tensors_to_transopse = ( + tensors_to_transpose = ( "dense.weight", "attention.self.query", "attention.self.key", @@ -81,7 +81,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s for var_name in state_dict: tf_name = to_tf_var_name(var_name) torch_tensor = state_dict[var_name].numpy() - if any([x in var_name for x in tensors_to_transopse]): + if any([x in var_name for x in tensors_to_transpose]): torch_tensor = torch_tensor.T tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) tf.keras.backend.set_value(tf_var, torch_tensor) From 769bb643ce4e6d5836b41b41430ce02473907ec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Ch=C3=A2tel?= Date: Wed, 31 Jul 2019 16:17:15 +0200 Subject: [PATCH 15/23] Fixing a broken link. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e2074f727..a4905e5854 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ These implementations have been tested on several datasets (see the example scri | Section | Description | |-|-| | [Installation](#installation) | How to install the package | -| [Quick tour: Usage](#quick-tour-usage) | Tokenizers & models usage: Bert and GPT-2 | +| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 | | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers | | [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more | From 97091acb8c5bd192a354375e58352694007b2390 Mon Sep 17 00:00:00 2001 From: Pierric Cistac Date: Wed, 31 Jul 2019 10:37:56 -0400 Subject: [PATCH 16/23] Small spelling fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a4905e5854..54c0ac94a1 100644 --- a/README.md +++ b/README.md @@ -283,7 +283,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters. -The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/). +The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/). 
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`. From f2a3eb987e1fc2c85320fc3849c67811f5736b50 Mon Sep 17 00:00:00 2001 From: Anthony MOI Date: Wed, 31 Jul 2019 11:05:06 -0400 Subject: [PATCH 17/23] Fix small typos --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 54c0ac94a1..7365e02a09 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ python ./examples/run_glue.py \ --warmup_steps=120 ``` -On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should results in a Pearson correlation coefficient of `+0.917` on the development set. +On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should result in a Pearson correlation coefficient of `+0.917` on the development set. #### Fine-tuning Bert model on the MRPC classification task @@ -264,7 +264,7 @@ This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-s ### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet A conditional generation script is also included to generate text from a prompt. -The generation script include the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer). +The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
Here is how to run the script with the small version of OpenAI GPT-2 model: From 44dd941efb602433b7edc29612cbdd0a03bf14dc Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 31 Jul 2019 21:09:04 -0400 Subject: [PATCH 18/23] link to `swift-coreml-transformers` --- README.md | 10 ++++++++++ docs/source/installation.rst | 13 +++++++++++++ 2 files changed, 23 insertions(+) diff --git a/README.md b/README.md index 7365e02a09..703eb47df9 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,16 @@ python -m pytest -sv ./pytorch_transformers/tests/ python -m pytest -sv ./examples/ ``` +### Do you want to run a Transformer model on a mobile device? + +You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo. + +It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices. + +At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, +or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! + + ## Quick tour Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/). diff --git a/docs/source/installation.rst b/docs/source/installation.rst index f8beb9f1c8..3a4663da0b 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -50,3 +50,16 @@ If you want to reproduce the original tokenization process of the ``OpenAI GPT`` python -m spacy download en If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). 
+ + +Do you want to run a Transformer model on a mobile device? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You should check out our `swift-coreml-transformers <https://github.com/huggingface/swift-coreml-transformers>`_ repo. + +It contains an example of a conversion script from a Pytorch trained Transformer model (here, ``GPT-2``) to a CoreML model that runs on iOS devices. + +It also contains an implementation of BERT for Question answering. + +At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, +or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! From 7c524d631e4c0fd0531d02d6a155fc95a3e90810 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 5 Aug 2019 16:25:54 +0200 Subject: [PATCH 19/23] add issue templates --- .github/ISSUE_TEMPLATE/bug-report.md | 36 +++++++++++++++++++ .github/ISSUE_TEMPLATE/feature-request.md | 16 +++++++++ .github/ISSUE_TEMPLATE/migration.md | 43 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/question-help.md | 8 +++++ 4 files changed, 103 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug-report.md create mode 100644 .github/ISSUE_TEMPLATE/feature-request.md create mode 100644 .github/ISSUE_TEMPLATE/migration.md create mode 100644 .github/ISSUE_TEMPLATE/question-help.md diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md new file mode 100644 index 0000000000..0d9439887b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -0,0 +1,36 @@ +--- +name: "\U0001F41B Bug Report" +about: Submit a bug report to help us improve PyTorch Transformers +--- + +## 🐛 Bug + + + +## To Reproduce + +Steps to reproduce the behavior: + +1. +2. +3. + + + +## Expected behavior + + + +## Environment + +* OS: +* Python version: +* PyTorch version: +* PyTorch Transformers version (or branch): +* Using GPU ? +* Distributed or parallel setup ?
+* Any other relevant information: + +## Additional context + + \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md new file mode 100644 index 0000000000..828e3737be --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -0,0 +1,16 @@ +--- +name: "\U0001F680 Feature Request" +about: Submit a proposal/request for a new PyTorch Transformers feature +--- + +## 🚀 Feature + + + +## Motivation + + + +## Additional context + + \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md new file mode 100644 index 0000000000..9a8b19dffa --- /dev/null +++ b/.github/ISSUE_TEMPLATE/migration.md @@ -0,0 +1,43 @@ +--- +name: "\U0001F4DA Migration from PyTorch-pretrained-Bert" +about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers +--- + +## 📚 Migration + + + +Model I am using (Bert, XLNet....): + +The problem arise when using: +* [ ] the official example scripts +* [ ] my own modified scripts + +The tasks I am working on is: +* [ ] an official GLUE/SQUaD task: (give the name) +* [ ] my own task or dataset: (give details) + +Language I am using the model on (English, Chinese....): + +Details of the issue: + + + +## Environment + +* OS: +* Python version: +* PyTorch version: +* PyTorch Transformers version (or branch): +* Using GPU ? +* Distributed or parallel setup ? +* Any other relevant information: + +## Checklist + +- [ ] I have read the migration guide in the readme. +- [ ] I checked if a related official extension example runs on my machine.
+ +## Additional context + + \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/question-help.md b/.github/ISSUE_TEMPLATE/question-help.md new file mode 100644 index 0000000000..8c76994b02 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question-help.md @@ -0,0 +1,8 @@ +--- +name: "❓Questions & Help" +about: Start a general discussion related to PyTorch Transformers +--- + +## ❓ Questions & Help + + \ No newline at end of file From 077ad693e9c3b5702ba9874f7a0f0ed8099c9773 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 5 Aug 2019 16:46:29 +0200 Subject: [PATCH 20/23] tweak issue templates wordings --- .github/ISSUE_TEMPLATE/bug-report.md | 14 +++++++++++++- .github/ISSUE_TEMPLATE/migration.md | 10 +++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 0d9439887b..66f7831aea 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -5,7 +5,19 @@ about: Submit a bug report to help us improve PyTorch Transformers ## 🐛 Bug - + + +Model I am using (Bert, XLNet....): + +Language I am using the model on (English, Chinese....): + +The problem arise when using: +* [ ] the official example scripts: (give details) +* [ ] my own modified scripts: (give details) + +The tasks I am working on is: +* [ ] an official GLUE/SQUaD task: (give the name) +* [ ] my own task or dataset: (give details) ## To Reproduce diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md index 9a8b19dffa..cf0c9a4757 100644 --- a/.github/ISSUE_TEMPLATE/migration.md +++ b/.github/ISSUE_TEMPLATE/migration.md @@ -5,20 +5,20 @@ about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-T ## 📚 Migration - + Model I am using (Bert, XLNet....): +Language I am using the model on (English, Chinese....): + The problem arise when using: -* [ ] the official example scripts -* [ ] my own modified scripts +* [ ] the
official example scripts: (give details) +* [ ] my own modified scripts: (give details) The tasks I am working on is: * [ ] an official GLUE/SQUaD task: (give the name) * [ ] my own task or dataset: (give details) -Language I am using the model on (English, Chinese....): - Details of the issue: From 70c10caa06d9feda3f446d0a82655f56cd2afdab Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 5 Aug 2019 17:09:37 +0200 Subject: [PATCH 21/23] add option mentioned in #940 --- examples/run_glue.py | 6 ++++++ examples/run_squad.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/examples/run_glue.py b/examples/run_glue.py index 0d4ffaa390..a939ea373b 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -247,6 +247,9 @@ def evaluate(args, model, tokenizer, prefix=""): def load_and_cache_examples(args, task, tokenizer, evaluate=False): + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache + processor = processors[task]() output_mode = output_modes[task] # Load data features from cache or dataset file @@ -273,6 +276,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) + if args.local_rank == 0: + torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache + # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) diff --git a/examples/run_squad.py b/examples/run_squad.py index 7d768d2c43..e62a1f1ff3 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -272,6 +272,9 @@ def evaluate(args, model, tokenizer, prefix=""): def load_and_cache_examples(args, tokenizer, 
evaluate=False, output_examples=False): + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache + # Load data features from cache or dataset file input_file = args.predict_file if evaluate else args.train_file cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( @@ -296,6 +299,9 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) + if args.local_rank == 0: + torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache + # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) From 7223886dc944b5476ea6be1a9838738644a2e9a1 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 5 Aug 2019 17:16:56 +0200 Subject: [PATCH 22/23] fix #944 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 703eb47df9..f3d2865ba8 100644 --- a/README.md +++ b/README.md @@ -385,6 +385,7 @@ for batch in train_data: loss.backward() scheduler.step() optimizer.step() + optimizer.zero_grad() ``` ## Citation From 3a126e73dd020be851d59cfcdc741fe3e8c6ad4f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 5 Aug 2019 17:26:29 +0200 Subject: [PATCH 23/23] fix #950 --- .../convert_transfo_xl_checkpoint_to_pytorch.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py index b6672aedf7..5733146444 100755 --- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py +++ 
b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py @@ -24,11 +24,10 @@ from io import open import torch import pytorch_transformers.tokenization_transfo_xl as data_utils -from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME, - WEIGHTS_NAME, - TransfoXLConfig, - TransfoXLLMHeadModel, - load_tf_weights_in_transfo_xl) + +from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME +from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl) from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) if sys.version_info[0] == 2: