From 814a3f4e017020d67ce69048e2647e7bf5d0784f Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 14:11:14 -0400 Subject: [PATCH 01/17] Removed `attention_mask` from GPT-2 and GPT documentation. Corrected `multiple_choice_labels` to actual name `mc_labels` --- pytorch_transformers/modeling_gpt2.py | 10 +--------- pytorch_transformers/modeling_openai.py | 12 ++---------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 9022048d6d..2980cf269a 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -408,10 +408,6 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -642,10 +638,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `past` output below). Can be used to speed up sequential decoding. - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -656,7 +648,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-1`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` - **multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: + **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index cfea768736..690aa7812b 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -415,11 +415,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: A parallel sequence of tokens (can be used to indicate various portions of the inputs). The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices) **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -626,10 +622,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): A parallel sequence of tokens (can be used to indicate various portions of the inputs). The embeddings from these tokens will be summed with the respective token embeddings. Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -640,7 +632,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-1`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` - **multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: + **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) From 034aa0c2d743f5de787b650d9ef38ea49c66df3f Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 17:27:38 -0400 Subject: [PATCH 02/17] Fixed GPT2DoubleHeadsModel example and weight tying --- pytorch_transformers/modeling_gpt2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 2980cf269a..af8d3ad10f 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -682,6 +682,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) + model.resize_token_embeddings(tokenizer.vocab_size + 1) choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 @@ -696,6 +697,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): self.multiple_choice_head = SequenceSummary(config) self.apply(self.init_weights) + self.tie_weights() def tie_weights(self): """ Make sure we are sharing the input and output embeddings. From 47267ba556093e3cde815c655b14f9918df97dd3 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 17:50:16 -0400 Subject: [PATCH 03/17] OpenAI GPT-2 now depends on CommonTests. --- .../tests/modeling_common_test.py | 3 + .../tests/modeling_gpt2_test.py | 195 ++++++++++++++++-- 2 files changed, 182 insertions(+), 16 deletions(-) diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py index e974ae865d..64a17f9db0 100644 --- a/pytorch_transformers/tests/modeling_common_test.py +++ b/pytorch_transformers/tests/modeling_common_test.py @@ -210,6 +210,9 @@ class CommonTestCases: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + for model_class in self.all_model_classes: config.output_attentions = True config.output_hidden_states = False diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py index 4e32cc37e1..fa8f79a40b 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -18,31 +18,194 @@ from __future__ import print_function import unittest import pytest +import shutil -from pytorch_transformers import (GPT2Config, GPT2Model, +from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2DoubleHeadsModel) -from .modeling_common_test import CommonTestCases, ConfigTester +from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor -class GPT2ModelTest(unittest.TestCase): + +class GPT2ModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) + + class GPT2ModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = GPT2Config( + vocab_size_or_config_json_file=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = GPT2Model(config=config) + model.eval() + + model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + model(input_ids, token_type_ids=token_type_ids) + sequence_output, presents = model(input_ids) + + result = { + "sequence_output": sequence_output, + "presents": presents, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertEqual(len(result["presents"]), config.n_layer) + + def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = GPT2LMHeadModel(config) + model.eval() + + loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = GPT2DoubleHeadsModel(config) + model.eval() + + loss, lm_logits, mc_logits, _ = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'head_mask': head_mask + } + + return config, inputs_dict + + def setUp(self): + self.model_tester = GPT2ModelTest.GPT2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) def test_config(self): - config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) - config_tester.run_common_tests() + self.config_tester.run_common_tests() - def test_model(self): - model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model, - lm_head_model_class=GPT2LMHeadModel, - double_head_model_class=GPT2DoubleHeadsModel) - model_tester.run_common_tests(test_presents=True) + def test_gpt2_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model(*config_and_inputs) + + def test_gpt2_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt2_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) @pytest.mark.slow - def test_pretrained(self): - model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model, - lm_head_model_class=GPT2LMHeadModel, - double_head_model_class=GPT2DoubleHeadsModel) - model_tester.run_slow_tests() - + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) if __name__ == "__main__": unittest.main() From 55f69a11b638a54164d3e61921d60a8455b8066d Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 21 Aug 2019 18:09:25 -0400 Subject: [PATCH 04/17] OpenAI GPT tests now extend CommonTests --- .../tests/modeling_gpt2_test.py | 2 + .../tests/modeling_openai_test.py | 195 ++++++++++++++++-- 2 files changed, 181 insertions(+), 16 deletions(-) diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py index fa8f79a40b..1786ada54c 100644 --- a/pytorch_transformers/tests/modeling_gpt2_test.py +++ b/pytorch_transformers/tests/modeling_gpt2_test.py @@ -207,5 +207,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) self.assertIsNotNone(model) + + if __name__ == "__main__": unittest.main() diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py index 243afb9501..0fcb4b7d64 100644 --- a/pytorch_transformers/tests/modeling_openai_test.py +++ b/pytorch_transformers/tests/modeling_openai_test.py @@ -18,31 +18,194 @@ from __future__ import print_function import unittest import pytest +import shutil -from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) +from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) -from .modeling_common_test import CommonTestCases, ConfigTester +from .modeling_common_test import CommonTestCases, ConfigTester, ids_tensor -class OpenAIModelTest(unittest.TestCase): + +class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) + + class OpenAIGPTModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = OpenAIGPTConfig( + vocab_size_or_config_json_file=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTModel(config=config) + model.eval() + + model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + model(input_ids, token_type_ids=token_type_ids) + (sequence_output,) = model(input_ids) + + result = { + "sequence_output": sequence_output + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTLMHeadModel(config) + model.eval() + + loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTDoubleHeadsModel(config) + model.eval() + + loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) + + result = { + "loss": loss, + "lm_logits": lm_logits + } + + self.parent.assertListEqual( + list(result["loss"].size()), + []) + self.parent.assertListEqual( + list(result["lm_logits"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'head_mask': head_mask + } + + return config, inputs_dict + + def setUp(self): + self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) def test_config(self): - config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) - config_tester.run_common_tests() + self.config_tester.run_common_tests() - def test_model(self): - model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel, - lm_head_model_class=OpenAIGPTLMHeadModel, - double_head_model_class=OpenAIGPTDoubleHeadsModel) - model_tester.run_common_tests(test_presents=False) + def test_openai_gpt_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs) + + def test_openai_gpt_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_openai_gpt_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) @pytest.mark.slow - def test_pretrained(self): - model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel, - lm_head_model_class=OpenAIGPTLMHeadModel, - double_head_model_class=OpenAIGPTDoubleHeadsModel) - model_tester.run_slow_tests() + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() From 0517e7a1cb4a70bdf32f8d11b56df8d3911d1792 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:23:49 +0200 Subject: [PATCH 05/17] Fix GPT2 and RoBERTa tokenizer to beging with a space - update Roberta tokenizer --- pytorch_transformers/modeling_gpt2.py | 2 +- pytorch_transformers/tokenization_gpt2.py | 11 +- pytorch_transformers/tokenization_roberta.py | 112 +------------------ 3 files changed, 9 insertions(+), 116 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index af8d3ad10f..b63929db1c 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -682,7 +682,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) - model.resize_token_embeddings(tokenizer.vocab_size + 1) + model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings to the new vocabulary size (add a vector at the end) choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index e67f25ff59..eb56e7303e 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -109,11 +109,11 @@ class GPT2Tokenizer(PreTrainedTokenizer): bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) - self.encoder = json.load(open(vocab_file)) - self.decoder = {v:k for k,v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding + self.encoder = json.load(open(vocab_file, encoding="utf-8")) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_data] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) @@ -169,6 +169,7 @@ class GPT2Tokenizer(PreTrainedTokenizer): def _tokenize(self, text): """ Tokenize a string. """ + text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with. bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: @@ -214,4 +215,4 @@ class GPT2Tokenizer(PreTrainedTokenizer): writer.write(' '.join(bpe_tokens) + u'\n') index += 1 - return vocab_file, merge_file + return vocab_file, merge_file \ No newline at end of file diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index edf4717c89..c77ad07934 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -23,8 +23,7 @@ import os import regex as re from io import open -from .tokenization_gpt2 import bytes_to_unicode, get_pairs -from .tokenization_utils import PreTrainedTokenizer +from .tokenization_gpt2 import GPT2Tokenizer try: from functools import lru_cache @@ -63,7 +62,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { } -class RobertaTokenizer(PreTrainedTokenizer): +class RobertaTokenizer(GPT2Tokenizer): """ RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE """ @@ -77,89 +76,6 @@ class RobertaTokenizer(PreTrainedTokenizer): sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs) - self.encoder = json.load(open(vocab_file, encoding="utf-8")) - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] - bpe_merges = [tuple(merge.split()) for merge in bpe_data] - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {} - - # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions - self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") - - @property - def vocab_size(self): - return len(self.encoder) - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = ' '.join(word) - self.cache[token] = word - return word - - def _tokenize(self, text): - """ Tokenize a string. """ - bpe_tokens = [] - for token in re.findall(self.pat, text): - if sys.version_info[0] == 2: - token = ''.join(self.byte_encoder[ord(b)] for b in token) - else: - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) - bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) - return bpe_tokens - - def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" - return self.decoder.get(index) - - def convert_tokens_to_string(self, tokens): - """ Converts a sequence of tokens (string) in a single string. """ - text = ''.join(tokens) - text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) - return text - def add_special_tokens_single_sentence(self, token_ids): """ Adds special tokens to a sequence for sequence classification tasks. @@ -175,27 +91,3 @@ class RobertaTokenizer(PreTrainedTokenizer): sep = [self._convert_token_to_id(self.sep_token)] cls = [self._convert_token_to_id(self.cls_token)] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def save_vocabulary(self, save_directory): - """Save the tokenizer vocabulary and merge files to a directory.""" - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) - - with open(vocab_file, 'w', encoding='utf-8') as f: - f.write(json.dumps(self.encoder, ensure_ascii=False)) - - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) - index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') - index += 1 - - return vocab_file, merge_file From 50e6daf83abec67964e56ad8fce1477df92a7a3c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:27:43 +0200 Subject: [PATCH 06/17] fix Roberta tokenizer __init__ --- pytorch_transformers/tokenization_roberta.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index c77ad07934..26805d9f4e 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -72,7 +72,8 @@ class RobertaTokenizer(GPT2Tokenizer): def __init__(self, vocab_file, merges_file, errors='replace', bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token='', mask_token='', **kwargs): - super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, + super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, + bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs) From 306af132d7059a8528345a2aab4f64215c87723b Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:30:51 +0200 Subject: [PATCH 07/17] update readme to mention add_special_tokens more clearly in example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e57de5842..dc51629b4a 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ for model_class, tokenizer_class, pretrained_weights in MODELS: model = model_class.from_pretrained(pretrained_weights) # Encode text - input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")]) + input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)]) # Add special tokens takes care of adding [CLS], [SEP], ... tokens in the right way for each model. with torch.no_grad(): last_hidden_states = model(input_ids)[0] # Models outputs are now tuples From d51f72d5decaf89c94720f41b48478b343663fc5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:41:11 +0200 Subject: [PATCH 08/17] adding shortcut to the ids of all the special tokens --- pytorch_transformers/modeling_gpt2.py | 5 +- .../tests/tokenization_tests_commons.py | 4 +- pytorch_transformers/tokenization_utils.py | 56 +++++++++++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index b63929db1c..72b7a7df6f 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -679,13 +679,16 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): Examples:: + import torch + from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings to the new vocabulary size (add a vector at the end) choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices - mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 + mc_token_ids = torch.tensor([input_ids.size(-1)]) # Batch size 1 outputs = model(input_ids, mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index ebcf6f48d8..8d94355bea 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -128,8 +128,8 @@ class CommonTestCases: self.assertGreater(tokens[0], tokens[1]) self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token)) - self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token)) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.eos_token_id) def test_required_methods_tokenizer(self): diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index d2855e0922..1116735d0e 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -155,6 +155,62 @@ class PreTrainedTokenizer(object): def additional_special_tokens(self, value): self._additional_special_tokens = value + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ + if self._bos_token is None: + logger.error("Using bos_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._bos_token) + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ + if self._eos_token is None: + logger.error("Using eos_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._eos_token) + + @property + def unk_token_is(self): + """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ + if self._unk_token is None: + logger.error("Using unk_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._unk_token) + + @property + def sep_token_id(self): + """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ + if self._sep_token is None: + logger.error("Using sep_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._sep_token) + + @property + def pad_token_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + if self._pad_token is None: + logger.error("Using pad_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._pad_token) + + @property + def cls_token_id(self): + """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + if self._cls_token is None: + logger.error("Using cls_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._cls_token) + + @property + def mask_token_id(self): + """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ + if self._mask_token is None: + logger.error("Using mask_token, but it is not set yet.") + return self.convert_tokens_to_ids(self._mask_token) + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ + if self._additional_special_tokens is None: + logger.error("Using additional_special_tokens, but it is not set yet.") + return self.convert_tokens_to_ids(self._additional_special_tokens) + def __init__(self, max_len=None, **kwargs): self._bos_token = None self._eos_token = None From 0f5a7994568289894e949eafc5831783be87174c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 11:49:23 +0200 Subject: [PATCH 09/17] fix GPT2DoubleHeadModel docstring --- pytorch_transformers/modeling_gpt2.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index 72b7a7df6f..b7872a36d2 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -568,8 +568,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): Examples:: + import torch + from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] @@ -684,12 +688,20 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') - tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) - model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings to the new vocabulary size (add a vector at the end) + + # Add a [CLS] to the vocabulary (we should train it also!) + tokenizer.add_special_tokens({'cls_token': '[CLS]'}) + model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size + print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary + choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] - input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices - mc_token_ids = torch.tensor([input_ids.size(-1)]) # Batch size 1 - outputs = model(input_ids, mc_token_ids) + encoded_choices = [tokenizer.encode(s) for s in choices] + cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 + mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 + + outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ From abe734ca1fe8a618ffe5af61e029c18d4c1f0d8b Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:20:18 +0200 Subject: [PATCH 10/17] fix GPT-2 and RoBERTa tests to be clean now --- .../tests/tokenization_gpt2_test.py | 17 +++++++++-------- .../tests/tokenization_roberta_test.py | 17 +++++++++-------- .../tests/tokenization_tests_commons.py | 4 ++-- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index da7028c27d..8ba3be7e5d 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -31,17 +31,18 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] + "\u0120", "\u0120l", "\u0120n", + "\u0120lo", "\u0120low", "er", + "\u0120lowest", "\u0120newer", "\u0120wider", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] self.special_tokens_map = {"unk_token": ""} self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) - with open(self.vocab_file, "w") as fp: + with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) def get_tokenizer(self): @@ -49,18 +50,18 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): def get_input_output_texts(self): input_text = u"lower newer" - output_text = u"lowernewer" + output_text = u" lower newer" return input_text, output_text def test_full_tokenizer(self): tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower" - bpe_tokens = ["low", "er"] + bpe_tokens = ["\u0120low", "er"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [13, 12, 17] + input_bpe_tokens = [14, 15, 19] self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index a8f940ae43..960a91a5e1 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -30,17 +30,18 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] + "\u0120", "\u0120l", "\u0120n", + "\u0120lo", "\u0120low", "er", + "\u0120lowest", "\u0120newer", "\u0120wider", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] self.special_tokens_map = {"unk_token": ""} self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) - with open(self.vocab_file, "w") as fp: + with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) def get_tokenizer(self): @@ -48,18 +49,18 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): def get_input_output_texts(self): input_text = u"lower newer" - output_text = u"lowernewer" + output_text = u" lower newer" return input_text, output_text def test_full_tokenizer(self): tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower" - bpe_tokens = ["low", "er"] + bpe_tokens = ["\u0120low", "er"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [13, 12, 17] + input_bpe_tokens = [14, 15, 19] self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index 8d94355bea..11db292313 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -111,7 +111,7 @@ class CommonTestCases: self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) new_toks_2 = {'eos_token': ">>>>|||<||<<|<<", - 'pad_token': "<<<<<|||>|>>>>|>"} + 'pad_token': "<<<<<|||>|>>>>|>"} added_toks_2 = tokenizer.add_special_tokens(new_toks_2) vocab_size_3 = tokenizer.vocab_size all_size_3 = len(tokenizer) @@ -129,7 +129,7 @@ class CommonTestCases: self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) self.assertGreater(tokens[-2], tokens[-3]) self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-2], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) def test_required_methods_tokenizer(self): From fd10d79b55d159d845a30adb238cd7019965aa23 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:23:12 +0200 Subject: [PATCH 11/17] update GPT2 docstring --- pytorch_transformers/tokenization_gpt2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index eb56e7303e..1fa7cbd06b 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -99,7 +99,10 @@ def get_pairs(word): class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: - - Byte-level BPE + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => will add a space is there isn't. + As a consequence, this tokenizer `encode` and `decode` method will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello" """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP From 4e6a3172cecef53f790f1c995c7569ca11e04444 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:23:37 +0200 Subject: [PATCH 12/17] update roberta docstring as well --- pytorch_transformers/tokenization_roberta.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index 26805d9f4e..f290168c95 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -64,7 +64,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { class RobertaTokenizer(GPT2Tokenizer): """ - RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE + RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: + - Byte-level Byte-Pair-Encoding + - Requires a space to start the input string => will add a space is there isn't. + As a consequence, this tokenizer `encode` and `decode` method will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello" """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP From ca1a00a302c6aff525d949d398ee6bfe42e3e194 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:29:31 +0200 Subject: [PATCH 13/17] fix for python2 --- pytorch_transformers/tests/tokenization_gpt2_test.py | 5 +++-- pytorch_transformers/tests/tokenization_roberta_test.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index 8ba3be7e5d..3e4fb5bc1d 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest import json +from io import open from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES @@ -55,8 +56,8 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): def test_full_tokenizer(self): tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - text = "lower" - bpe_tokens = ["\u0120low", "er"] + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120newer"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index 960a91a5e1..e2082e7613 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import json import unittest +from io import open from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases From 5dd7b677adbd2a228328e42b79583143c16b8dff Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 12:43:08 +0200 Subject: [PATCH 14/17] clean up all byte-level bpe tests --- pytorch_transformers/tests/tokenization_gpt2_test.py | 4 ++-- pytorch_transformers/tests/tokenization_roberta_test.py | 6 +++--- pytorch_transformers/tokenization_gpt2.py | 9 +++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index 3e4fb5bc1d..fbeaa2a6df 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -57,12 +57,12 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): def test_full_tokenizer(self): tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower newer" - bpe_tokens = ["\u0120low", "er", "\u0120newer"] + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [14, 15, 19] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index e2082e7613..e1e7f45efd 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -55,13 +55,13 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): def test_full_tokenizer(self): tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - text = "lower" - bpe_tokens = ["\u0120low", "er"] + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [14, 15, 19] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py index 1fa7cbd06b..8a9ade87e1 100644 --- a/pytorch_transformers/tokenization_gpt2.py +++ b/pytorch_transformers/tokenization_gpt2.py @@ -64,13 +64,14 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { @lru_cache() def bytes_to_unicode(): """ - Returns list of utf-8 byte and a corresponding list of unicode strings. + Returns list of utf-8 byte and a mapping to unicode strings. + We specifically avoids mapping to whitespace/control characters the bpe code barfs on. + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. """ _chr = unichr if sys.version_info[0] == 2 else chr bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) @@ -176,9 +177,9 @@ class GPT2Tokenizer(PreTrainedTokenizer): bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: - token = ''.join(self.byte_encoder[ord(b)] for b in token) + token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) else: - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens From ce5ef4b35d5c81813224424dbb71ec8889155bcd Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 13:22:43 +0200 Subject: [PATCH 15/17] python2 doesn't spark joy --- pytorch_transformers/tests/tokenization_gpt2_test.py | 2 +- pytorch_transformers/tests/tokenization_roberta_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py index fbeaa2a6df..c7e24e8364 100644 --- a/pytorch_transformers/tests/tokenization_gpt2_test.py +++ b/pytorch_transformers/tests/tokenization_gpt2_test.py @@ -42,7 +42,7 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens)) + fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index e1e7f45efd..45e45d645c 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -41,7 +41,7 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens)) + fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) From 8faf2e086b4ebefda03feee98f0d5a65238658af Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 13:36:22 +0200 Subject: [PATCH 16/17] more doc on special tokens --- pytorch_transformers/tokenization_utils.py | 23 ++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 1116735d0e..7c66588cc7 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -43,21 +43,21 @@ class PreTrainedTokenizer(object): Parameters: - - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` + - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` - - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` + - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` - - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` + - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` - - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` + - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` - - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` + - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` - - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` + - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` - - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` + - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` - - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` + - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` """ vocab_files_names = {} pretrained_vocab_files_map = {} @@ -494,6 +494,13 @@ class PreTrainedTokenizer(object): to class attributes. If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - special tokens are carefully handled by the tokenizer (they are never split) + - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') + Args: special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, From f8aace6bcd1f72ba962263be3de6876572a366a5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 13:39:52 +0200 Subject: [PATCH 17/17] update tokenizers to use self.XX_token_id instead of converting self.XX_token --- pytorch_transformers/tokenization_bert.py | 6 +++--- pytorch_transformers/tokenization_roberta.py | 6 +++--- pytorch_transformers/tokenization_xlm.py | 6 +++--- pytorch_transformers/tokenization_xlnet.py | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index 04f35aa466..434898d1aa 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -171,15 +171,15 @@ class BertTokenizer(PreTrainedTokenizer): Adds special tokens to the a sequence for sequence classification tasks. A BERT sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def save_vocabulary(self, vocab_path): diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index f290168c95..7c8b3587a1 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -86,13 +86,13 @@ class RobertaTokenizer(GPT2Tokenizer): Adds special tokens to a sequence for sequence classification tasks. A RoBERTa sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A RoBERTa sequence pair has the following format: [CLS] A [SEP][SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 2d2f3a8cd4..ae9fe6c828 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -220,15 +220,15 @@ class XLMTokenizer(PreTrainedTokenizer): Adds special tokens to a sequence for sequence classification tasks. An XLM sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def save_vocabulary(self, save_directory): diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py index 371b3c9407..b479a2832d 100644 --- a/pytorch_transformers/tokenization_xlnet.py +++ b/pytorch_transformers/tokenization_xlnet.py @@ -182,8 +182,8 @@ class XLNetTokenizer(PreTrainedTokenizer): Adds special tokens to a sequence pair for sequence classification tasks. An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return token_ids + sep + cls def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): @@ -191,8 +191,8 @@ class XLNetTokenizer(PreTrainedTokenizer): Adds special tokens to a sequence for sequence classification tasks. An XLNet sequence has the following format: X [SEP][CLS] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return token_ids_0 + sep + token_ids_1 + sep + cls def save_vocabulary(self, save_directory):