From f872eb98c2b046c9f806b8cf0196d48e77c17899 Mon Sep 17 00:00:00 2001 From: dhanajitb Date: Thu, 28 Mar 2019 22:46:15 +0530 Subject: [PATCH 01/47] making unconditional generation work The unconditional generation works now but if the seed is fixed, the sample is the same every time. n_samples > 1 will give different samples though. I am giving the start token as '<|endoftext|>' for the unconditional generation. --- examples/run_gpt2.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py index 0350747499..0289b26702 100644 --- a/examples/run_gpt2.py +++ b/examples/run_gpt2.py @@ -106,6 +106,23 @@ def run_model(): print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80) + if args.unconditional: + generated = 0 + for _ in range(args.nsamples // args.batch_size): + out = sample_sequence( + model=model, length=args.length, + context=None, + start_token=enc.encoder['<|endoftext|>'], + batch_size=args.batch_size, + temperature=args.temperature, top_k=args.top_k, device=device + ) + out = out[:,1:].tolist() + for i in range(args.batch_size): + generated += 1 + text = enc.decode(out[i]) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) if __name__ == '__main__': run_model() From 0d6a882f63ddc1726e43efee0151b5bce3d67eb2 Mon Sep 17 00:00:00 2001 From: dhanajitb Date: Sun, 7 Apr 2019 16:54:38 +0530 Subject: [PATCH 02/47] Cleaned some redundant lines ```while not args.unconditional: if not args.unconditional: ``` These lines have been updated --- examples/run_gpt2.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py index 0289b26702..f9a1962d26 100644 --- a/examples/run_gpt2.py +++ b/examples/run_gpt2.py @@ -83,29 +83,29 @@ def run_model(): elif args.length > model.config.n_ctx: raise ValueError("Can't get samples longer than window size: 
%s" % model.config.n_ctx) - while not args.unconditional: - if not args.unconditional: + if not args.unconditional: + while True: raw_text = input("Model prompt >>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input("Model prompt >>> ") context_tokens = enc.encode(raw_text) - generated = 0 - for _ in range(args.nsamples // args.batch_size): - out = sample_sequence( - model=model, length=args.length, - context=context_tokens if not args.unconditional else None, - start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None, - batch_size=args.batch_size, - temperature=args.temperature, top_k=args.top_k, device=device - ) - out = out[:, len(context_tokens):].tolist() - for i in range(args.batch_size): - generated += 1 - text = enc.decode(out[i]) - print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) - print(text) - print("=" * 80) + generated = 0 + for _ in range(args.nsamples // args.batch_size): + out = sample_sequence( + model=model, length=args.length, + context=context_tokens, + start_token=None, + batch_size=args.batch_size, + temperature=args.temperature, top_k=args.top_k, device=device + ) + out = out[:, len(context_tokens):].tolist() + for i in range(args.batch_size): + generated += 1 + text = enc.decode(out[i]) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) if args.unconditional: generated = 0 for _ in range(args.nsamples // args.batch_size): From 4d3cf0d6028d7576b8c51ba1eda8403e86b42b05 Mon Sep 17 00:00:00 2001 From: Dhanajit Brahma Date: Sun, 7 Apr 2019 16:59:07 +0530 Subject: [PATCH 03/47] removing some redundant lines --- examples/run_gpt2.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py index 0289b26702..b22df39b98 100644 --- a/examples/run_gpt2.py +++ b/examples/run_gpt2.py @@ -83,29 +83,29 @@ def run_model(): elif args.length > model.config.n_ctx: raise 
ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx) - while not args.unconditional: - if not args.unconditional: + if not args.unconditional: + while True: raw_text = input("Model prompt >>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input("Model prompt >>> ") context_tokens = enc.encode(raw_text) - generated = 0 - for _ in range(args.nsamples // args.batch_size): - out = sample_sequence( - model=model, length=args.length, - context=context_tokens if not args.unconditional else None, - start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None, - batch_size=args.batch_size, - temperature=args.temperature, top_k=args.top_k, device=device - ) - out = out[:, len(context_tokens):].tolist() - for i in range(args.batch_size): - generated += 1 - text = enc.decode(out[i]) - print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) - print(text) - print("=" * 80) + generated = 0 + for _ in range(args.nsamples // args.batch_size): + out = sample_sequence( + model=model, length=args.length, + context=context_tokens, + start_token=None, + batch_size=args.batch_size, + temperature=args.temperature, top_k=args.top_k, device=device + ) + out = out[:, len(context_tokens):].tolist() + for i in range(args.batch_size): + generated += 1 + text = enc.decode(out[i]) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) if args.unconditional: generated = 0 for _ in range(args.nsamples // args.batch_size): @@ -127,3 +127,4 @@ def run_model(): if __name__ == '__main__': run_model() + From fd8a3556f08bbcfb9c4f3eadea6206751c1b1dd9 Mon Sep 17 00:00:00 2001 From: Benjamin Mann <8enmann@gmail.com> Date: Mon, 8 Apr 2019 17:20:35 -0700 Subject: [PATCH 04/47] fix run_gpt2.py --- examples/run_gpt2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py index 0350747499..a30c6c6456 100644 --- a/examples/run_gpt2.py +++ 
b/examples/run_gpt2.py @@ -83,7 +83,8 @@ def run_model(): elif args.length > model.config.n_ctx: raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx) - while not args.unconditional: + while True: + context_tokens = [] if not args.unconditional: raw_text = input("Model prompt >>> ") while not raw_text: @@ -106,6 +107,8 @@ def run_model(): print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80) + if args.unconditional: + break if __name__ == '__main__': run_model() From 8fffba5f475d175dfc9246a5ebc0b99fde115221 Mon Sep 17 00:00:00 2001 From: Yaroslav Bulatov Date: Tue, 9 Apr 2019 14:45:47 -0700 Subject: [PATCH 05/47] Update README.md Fix for ```> > > > 04/09/2019 21:39:38 - INFO - __main__ - device: cuda n_gpu: 1, distributed training: False, 16-bits training: False Traceback (most recent call last): File "/home/ubuntu/pytorch-pretrained-BERT/examples/lm_finetuning/simple_lm_finetuning.py", line 642, in main() File "/home/ubuntu/pytorch-pretrained-BERT/examples/lm_finetuning/simple_lm_finetuning.py", line 502, in main raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.") ValueError: Training is currently the only implemented execution option. Please set `do_train`. 
``` --- examples/lm_finetuning/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/lm_finetuning/README.md b/examples/lm_finetuning/README.md index c48d9b7069..f04e877ef2 100644 --- a/examples/lm_finetuning/README.md +++ b/examples/lm_finetuning/README.md @@ -37,6 +37,7 @@ python3 simple_lm_finetuning.py --bert_model bert-base-uncased --do_lower_case --output_dir finetuned_lm/ +--do_train ``` ### Pregenerating training data @@ -60,4 +61,4 @@ python3 finetune_on_pregenerated.py --do_lower_case --output_dir finetuned_lm/ --epochs 3 -``` \ No newline at end of file +``` From e99b2014ccaa4a19846ccb5191e63b4bfdb1baa6 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Apr 2019 11:43:13 +0200 Subject: [PATCH 06/47] fixes #471 --- pytorch_pretrained_bert/modeling_openai.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 296abbfc31..7bf643675e 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -371,8 +371,8 @@ class OpenAIGPTMultipleChoiceHead(nn.Module): def forward(self, hidden_states, mc_token_ids): # Classification logits # hidden_state (bsz, num_choices, seq_length, hidden_size) - # mc_token_ids (bsz, num_choices) - mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, num_choices, 1) + mc_token_ids = mc_token_ids.unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) # (bsz, num_choices, 1, hidden_size) multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) # (bsz, num_choices, hidden_size) @@ -605,14 +605,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): return # Update config self.config.n_special = num_special_tokens - # # Build new embeddings and initialize + # Build new embeddings and initialize all new embeddings (in particular the special tokens) old_embed = 
self.tokens_embed self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) - # Initialize all new embeddings (in particular the special tokens) self.init_weights(self.tokens_embed) - # Copy word and positional embeddings from the previous weights - self.tokens_embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :] - self.tokens_embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :] + # Copy word embeddings from the previous weights + self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] def forward(self, input_ids, position_ids=None, token_type_ids=None): if position_ids is None: From 4a82f4f85685c22b995108909485d822f3e3c607 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Apr 2019 13:11:22 +0200 Subject: [PATCH 07/47] update special token addition --- pytorch_pretrained_bert/modeling_openai.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index fb3d0cadb7..feae95d962 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -608,6 +608,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): # Build new embeddings and initialize all new embeddings (in particular the special tokens) old_embed = self.tokens_embed self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) + self.tokens_embed.to(old_embed.device.weight.device) self.init_weights(self.tokens_embed) # Copy word embeddings from the previous weights self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] From a05fad8dcee87087368ad996fe2d76599b406e34 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Apr 2019 13:16:17 +0200 Subject: [PATCH 08/47] fix typo --- pytorch_pretrained_bert/modeling_openai.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index feae95d962..1a2a3feb20 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -608,7 +608,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): # Build new embeddings and initialize all new embeddings (in particular the special tokens) old_embed = self.tokens_embed self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) - self.tokens_embed.to(old_embed.device.weight.device) + self.tokens_embed.to(old_embed.weight.device) self.init_weights(self.tokens_embed) # Copy word embeddings from the previous weights self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] From 4bc4c69af92ceef82b1c9df126cc14f0eb7033e8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Apr 2019 16:57:59 +0200 Subject: [PATCH 09/47] finetuning any BERT model - fixes #455 --- examples/lm_finetuning/finetune_on_pregenerated.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 035f97b0c9..6a63324502 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -123,9 +123,8 @@ def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) - parser.add_argument("--bert_model", type=str, required=True, - choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", - "bert-base-multilingual", "bert-base-chinese"]) + parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, 
bert-base-multilingual, bert-base-chinese.") parser.add_argument("--do_lower_case", action="store_true") parser.add_argument("--reduce_memory", action="store_true", help="Store training data as on-disc memmaps to massively reduce memory usage") From 724eb45cef001bf8b73ada5c80494c58c361ef24 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Apr 2019 17:12:00 +0200 Subject: [PATCH 10/47] add stale bot --- .github/stale.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/stale.yml diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 0000000000..d9f6563218 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,17 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 60 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 7 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned + - security +# Label to use when marking an issue as stale +staleLabel: wontfix +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. +# Comment to post when closing a stale issue. 
Set to `false` to disable +closeComment: false \ No newline at end of file From 074c869bbebd9ad1b8ec1c52ecc506ba982e8483 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Apr 2019 20:53:50 +0200 Subject: [PATCH 11/47] fix OpenAIGPTMultipleChoiceHead --- pytorch_pretrained_bert/modeling_openai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index fb3d0cadb7..b6252d097f 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -371,8 +371,8 @@ class OpenAIGPTMultipleChoiceHead(nn.Module): def forward(self, hidden_states, mc_token_ids): # Classification logits # hidden_state (bsz, num_choices, seq_length, hidden_size) - # mc_token_ids (bsz, num_choices, 1) - mc_token_ids = mc_token_ids.unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, num_choices) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) # (bsz, num_choices, 1, hidden_size) multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) # (bsz, num_choices, hidden_size) From c49ce3c722c35324803e40efb88b1a3057c7f249 Mon Sep 17 00:00:00 2001 From: Jie Yang Date: Thu, 11 Apr 2019 15:40:19 -0400 Subject: [PATCH 12/47] fix tsv read error in Windows --- examples/run_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 751d581ad9..4268c41ec6 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -95,7 +95,7 @@ class DataProcessor(object): @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" - with open(input_file, "r") as f: + with open(input_file, "r", encoding="utf-8") as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) lines = [] for line in reader: From 1d203a34c06fb8b2c1de856d58950f9d193cc1fc 
Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Apr 2019 23:51:03 +0200 Subject: [PATCH 13/47] back to simple indexing --- pytorch_pretrained_bert/modeling_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 1a2a3feb20..be4f959485 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -372,7 +372,7 @@ class OpenAIGPTMultipleChoiceHead(nn.Module): # Classification logits # hidden_state (bsz, num_choices, seq_length, hidden_size) # mc_token_ids (bsz, num_choices, 1) - mc_token_ids = mc_token_ids.unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) # (bsz, num_choices, 1, hidden_size) multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) # (bsz, num_choices, hidden_size) From b509bf765574852648020d60690386b80e970cf6 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 12 Apr 2019 12:12:33 +0200 Subject: [PATCH 14/47] updating loss computation --- pytorch_pretrained_bert/modeling_openai.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index be4f959485..c4d20c331e 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -716,9 +716,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n - shift_logits = lm_logits[:, :-1].contiguous() - shift_labels = lm_labels[:, 1:].contiguous() - + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), 
@@ -808,11 +807,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: - shift_logits = lm_logits[:, :-1].contiguous() - shift_labels = lm_labels[:, 1:].contiguous() + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) - losses.append(loss_fct(shift_logits.view(-1, - shift_logits.size(-1)), shift_labels.view(-1))) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) if mc_labels is not None: loss_fct = CrossEntropyLoss() losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) From dbbd6c7500dded778706326c7a1e402cffe97eb8 Mon Sep 17 00:00:00 2001 From: Matthew Carrigan Date: Fri, 12 Apr 2019 15:07:58 +0100 Subject: [PATCH 15/47] Replaced some randints with cleaner randranges, and added a helpful error for users whose corpus is just one giant document. 
--- .../pregenerate_training_data.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index 8cc28d2e78..e6c3598a9f 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -4,7 +4,7 @@ from tqdm import tqdm, trange from tempfile import TemporaryDirectory import shelve -from random import random, randint, shuffle, choice, sample +from random import random, randrange, randint, shuffle, choice, sample from pytorch_pretrained_bert.tokenization import BertTokenizer import numpy as np import json @@ -30,6 +30,8 @@ class DocumentDatabase: self.reduce_memory = reduce_memory def add_document(self, document): + if not document: + return if self.reduce_memory: current_idx = len(self.doc_lengths) self.document_shelf[str(current_idx)] = document @@ -49,11 +51,11 @@ class DocumentDatabase: self._precalculate_doc_weights() rand_start = self.doc_cumsum[current_idx] rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx] - sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max + sentence_index = randrange(rand_start, rand_end) % self.cumsum_max sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right') else: # If we don't use sentence weighting, then every doc has an equal chance to be chosen - sampled_doc_index = current_idx + randint(1, len(self.doc_lengths)-1) + sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths) assert sampled_doc_index != current_idx if self.reduce_memory: return self.document_shelf[str(sampled_doc_index)] @@ -170,7 +172,7 @@ def create_instances_from_document( # (first) sentence. 
a_end = 1 if len(current_chunk) >= 2: - a_end = randint(1, len(current_chunk) - 1) + a_end = randrange(1, len(current_chunk)) tokens_a = [] for j in range(a_end): @@ -186,7 +188,7 @@ def create_instances_from_document( # Sample a random document, with longer docs being sampled more frequently random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True) - random_start = randint(0, len(random_document) - 1) + random_start = randrange(0, len(random_document)) for j in range(random_start, len(random_document)): tokens_b.extend(random_document[j]) if len(tokens_b) >= target_b_length: @@ -264,6 +266,14 @@ def main(): else: tokens = tokenizer.tokenize(line) doc.append(tokens) + if doc: + docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added + if len(docs) <= 1: + exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to " + "ensure that random NextSentences are not sampled from the same document. Please add blank lines to " + "indicate breaks between documents in your input file. If your dataset does not contain multiple " + "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, " + "sections or paragraphs.") args.output_dir.mkdir(exist_ok=True) for epoch in trange(args.epochs_to_generate, desc="Epoch"): From 34cf67fd6c3690bdc02d15cbc44da272b938c330 Mon Sep 17 00:00:00 2001 From: Martin Boyanov Date: Fri, 12 Apr 2019 21:30:28 +0300 Subject: [PATCH 16/47] Extend the BertForSequenceClassification docs to mention the special CLS token. 
--- pytorch_pretrained_bert/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 2736e34d7f..037c6e9723 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -930,7 +930,7 @@ class BertForSequenceClassification(BertPreTrainedModel): Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts `extract_features.py`, `run_classifier.py` and `run_squad.py`) `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to From fe2756ff41147ea6de14d8f81ecc5304382af91d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 10:04:05 +0200 Subject: [PATCH 17/47] update double head model --- pytorch_pretrained_bert/modeling_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index c4d20c331e..7b95d74f7c 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -371,7 +371,7 @@ class OpenAIGPTMultipleChoiceHead(nn.Module): def forward(self, hidden_states, mc_token_ids): # Classification logits # hidden_state (bsz, num_choices, seq_length, hidden_size) - # mc_token_ids (bsz, num_choices, 1) + # mc_token_ids (bsz, num_choices) mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) # (bsz, num_choices, 1, hidden_size) multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) From 
3e65f255dcaf8cac7dabf11adc318756dc5664bb Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 11:47:25 +0200 Subject: [PATCH 18/47] add serialization semantics to tokenizers - fix transfo-xl tokenizer --- examples/run_transfo_xl.py | 3 +- pytorch_pretrained_bert/tokenization.py | 13 ++ pytorch_pretrained_bert/tokenization_gpt2.py | 16 +++ .../tokenization_openai.py | 16 +++ .../tokenization_transfo_xl.py | 129 +++--------------- 5 files changed, 67 insertions(+), 110 deletions(-) diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py index 8139f28baf..0ea7b32053 100644 --- a/examples/run_transfo_xl.py +++ b/examples/run_transfo_xl.py @@ -28,7 +28,7 @@ import math import torch -from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus +from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', @@ -80,6 +80,7 @@ def main(): # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax # and tokenizing the dataset # The pre-processed corpus is a convertion (using the conversion script ) + tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name) corpus = TransfoXLCorpus.from_pretrained(args.model_name) ntokens = len(corpus.vocab) diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index bbb3e25fc7..6e2e11ed92 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -134,6 +134,19 @@ class BertTokenizer(object): tokens.append(self.ids_to_tokens[i]) return tokens + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a path.""" + index = 0 + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if 
index != token_index: + logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): """ diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index db95719dbc..07db995b96 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -187,6 +187,22 @@ class GPT2Tokenizer(object): self.cache[token] = word return word + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a path.""" + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + json.dump(self.encoder, vocab_file) + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." 
+ " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(bpe_tokens + u'\n') + index += 1 + def encode(self, text): bpe_tokens = [] for token in re.findall(self.pat, text): diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index 240122d12d..aa0438ccf8 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -261,3 +261,19 @@ class OpenAIGPTTokenizer(object): ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m " ).replace(" 've", "'ve") return out_string + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a path.""" + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + json.dump(self.encoder, vocab_file) + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." 
+ " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(bpe_tokens + u'\n') + index += 1 diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py index b5360c5184..b6470c7667 100644 --- a/pytorch_pretrained_bert/tokenization_transfo_xl.py +++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py @@ -63,7 +63,10 @@ class TransfoXLTokenizer(object): if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] else: - vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + if os.path.isdir(pretrained_model_name_or_path): + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + else: + vocab_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) @@ -141,6 +144,11 @@ class TransfoXLTokenizer(object): else: raise ValueError('No token in vocabulary') + def save_vocabulary(self, vocab_path): + index = 0 + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + torch.save(self.__dict__, vocab_file) + def build_vocab(self): if self.vocab_file: print('building vocab from {}'.format(self.vocab_file)) @@ -245,82 +253,24 @@ class TransfoXLTokenizer(object): def __len__(self): return len(self.idx2sym) - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - if text in self.never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = 
unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - def whitespace_tokenize(self, text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - if self.delimiter == '': - tokens = text - else: - tokens = text.split(self.delimiter) - return tokens - def tokenize(self, line, add_eos=False, add_double_eos=False): - line = self._clean_text(line) line = line.strip() + # convert to lower case + if self.lower_case: + line = line.lower() - symbols = self.whitespace_tokenize(line) - - split_symbols = [] - for symbol in symbols: - if self.lower_case and symbol not in self.never_split: - symbol = symbol.lower() - symbol = self._run_strip_accents(symbol) - split_symbols.extend(self._run_split_on_punc(symbol)) + # empty delimiter '' will evaluate False + if self.delimiter == '': + symbols = line + else: + symbols = line.split(self.delimiter) if add_double_eos: # lm1b - return [''] + split_symbols + [''] + return [''] + symbols + [''] elif add_eos: - return split_symbols + [''] + return symbols + [''] else: - return split_symbols + return symbols class LMOrderedIterator(object): @@ -631,42 +581,3 @@ def get_lm_corpus(datadir, dataset): torch.save(corpus, fn) return corpus - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. 
- if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False From 870b734bfd2cc83e43b29050fba03709a0c5b539 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 12:03:56 +0200 Subject: [PATCH 19/47] added tokenizers serialization tests --- pytorch_pretrained_bert/tokenization.py | 1 + pytorch_pretrained_bert/tokenization_gpt2.py | 6 ++- .../tokenization_openai.py | 6 ++- .../tokenization_transfo_xl.py | 1 + tests/tokenization_openai_test.py | 16 +++++++ tests/tokenization_test.py | 11 +++++ tests/tokenization_transfo_xl_test.py | 42 ++++++------------- 7 files changed, 51 insertions(+), 32 deletions(-) diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 6e2e11ed92..8fd65f55f0 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -146,6 +146,7 @@ class BertTokenizer(object): index = token_index writer.write(token + u'\n') index += 1 + return vocab_file @classmethod def from_pretrained(cls, 
pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 07db995b96..b49e1310e4 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -188,7 +188,10 @@ class GPT2Tokenizer(object): return word def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a path.""" + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return vocab_file = os.path.join(vocab_path, VOCAB_NAME) merge_file = os.path.join(vocab_path, MERGES_NAME) json.dump(self.encoder, vocab_file) @@ -202,6 +205,7 @@ class GPT2Tokenizer(object): index = token_index writer.write(bpe_tokens + u'\n') index += 1 + return vocab_file, merge_file def encode(self, text): bpe_tokens = [] diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index aa0438ccf8..f3ce7ab251 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -263,7 +263,10 @@ class OpenAIGPTTokenizer(object): return out_string def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a path.""" + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return vocab_file = os.path.join(vocab_path, VOCAB_NAME) merge_file = os.path.join(vocab_path, MERGES_NAME) json.dump(self.encoder, vocab_file) @@ -277,3 +280,4 @@ class OpenAIGPTTokenizer(object): index = token_index writer.write(bpe_tokens + u'\n') index += 1 + return vocab_file, merge_file diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py index 
b6470c7667..f704a035db 100644 --- a/pytorch_pretrained_bert/tokenization_transfo_xl.py +++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py @@ -148,6 +148,7 @@ class TransfoXLTokenizer(object): index = 0 vocab_file = os.path.join(vocab_path, VOCAB_NAME) torch.save(self.__dict__, vocab_file) + return vocab_file def build_vocab(self): if self.vocab_file: diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py index 6213eb1b03..2b1bdd3a9a 100644 --- a/tests/tokenization_openai_test.py +++ b/tests/tokenization_openai_test.py @@ -52,5 +52,21 @@ class OpenAIGPTTokenizationTest(unittest.TestCase): self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + vocab_file, merges_file = tokenizer.save_vocabulary(vocab_path="/tmp/") + tokenizer.from_pretrained("/tmp/") + os.remove(vocab_file) + os.remove(merges_file) + + text = "lower" + bpe_tokens = ["low", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [""] + input_bpe_tokens = [14, 15, 20] + self.assertListEqual( + tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + if __name__ == '__main__': unittest.main() diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py index 78e145ffd2..15cc7ccd82 100644 --- a/tests/tokenization_test.py +++ b/tests/tokenization_test.py @@ -46,6 +46,17 @@ class TokenizationTest(unittest.TestCase): self.assertListEqual( tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/") + tokenizer.from_pretrained(vocab_file) + os.remove(vocab_file) + + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_chinese(self): tokenizer = BasicTokenizer() diff --git a/tests/tokenization_transfo_xl_test.py 
b/tests/tokenization_transfo_xl_test.py index 9ff04f5f34..add2eb4e71 100644 --- a/tests/tokenization_transfo_xl_test.py +++ b/tests/tokenization_transfo_xl_test.py @@ -18,9 +18,7 @@ import os import unittest from io import open -from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer, - _is_control, _is_punctuation, - _is_whitespace) +from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer class TransfoXLTokenizationTest(unittest.TestCase): @@ -43,6 +41,17 @@ class TransfoXLTokenizationTest(unittest.TestCase): self.assertListEqual( tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) + vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/") + tokenizer.from_pretrained(vocab_file) + os.remove(vocab_file) + + tokens = tokenizer.tokenize(u" UNwant\u00E9d,running") + self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) + + def test_full_tokenizer_lower(self): tokenizer = TransfoXLTokenizer(lower_case=True) @@ -58,33 +67,6 @@ class TransfoXLTokenizationTest(unittest.TestCase): tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), ["HeLLo", "!", "how", "Are", "yoU", "?"]) - def test_is_whitespace(self): - self.assertTrue(_is_whitespace(u" ")) - self.assertTrue(_is_whitespace(u"\t")) - self.assertTrue(_is_whitespace(u"\r")) - self.assertTrue(_is_whitespace(u"\n")) - self.assertTrue(_is_whitespace(u"\u00A0")) - - self.assertFalse(_is_whitespace(u"A")) - self.assertFalse(_is_whitespace(u"-")) - - def test_is_control(self): - self.assertTrue(_is_control(u"\u0005")) - - self.assertFalse(_is_control(u"A")) - self.assertFalse(_is_control(u" ")) - self.assertFalse(_is_control(u"\t")) - self.assertFalse(_is_control(u"\r")) - - def test_is_punctuation(self): - self.assertTrue(_is_punctuation(u"-")) - self.assertTrue(_is_punctuation(u"$")) - self.assertTrue(_is_punctuation(u"`")) - self.assertTrue(_is_punctuation(u".")) - - self.assertFalse(_is_punctuation(u"A")) - self.assertFalse(_is_punctuation(u" ")) - if __name__ == '__main__': unittest.main() From e8568a3b17454dd4e0b32b6cd80617aa662cc996 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 12:55:38 +0200 Subject: [PATCH 20/47] fixing tests --- pytorch_pretrained_bert/tokenization_gpt2.py | 27 ++++++++++++++++--- .../tokenization_openai.py | 27 ++++++++++++++++--- tests/tokenization_openai_test.py | 2 +- tests/tokenization_transfo_xl_test.py | 9 +++---- 4 files changed, 51 insertions(+), 14 deletions(-) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index b49e1310e4..ab80876ee5 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -45,6 +45,7 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { } VOCAB_NAME = 'vocab.json' MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' @lru_cache() def bytes_to_unicode(): @@ -97,6 +98,11 @@ class GPT2Tokenizer(object): else: vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) 
+ special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info("loading special tokens file {}".format(special_tokens_file)) # redirect to the cache, if necessary try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) @@ -125,7 +131,11 @@ class GPT2Tokenizer(object): max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs) + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs) return tokenizer def __init__(self, vocab_file, merges_file, errors='replace', max_len=None): @@ -194,7 +204,11 @@ class GPT2Tokenizer(object): return vocab_file = os.path.join(vocab_path, VOCAB_NAME) merge_file = os.path.join(vocab_path, MERGES_NAME) - json.dump(self.encoder, vocab_file) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write(u'#version: 0.2\n') @@ -203,9 +217,14 @@ class GPT2Tokenizer(object): logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." 
" Please check that the tokenizer is not corrupted!".format(merge_file)) index = token_index - writer.write(bpe_tokens + u'\n') + writer.write(' '.join(bpe_tokens) + u'\n') index += 1 - return vocab_file, merge_file + + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token in sorted(self.special_tokens.keys(), key=lambda kv: kv[1]): + writer.write(token + u'\n') + + return vocab_file, merge_file, special_tokens_file def encode(self, text): bpe_tokens = [] diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index f3ce7ab251..d9713e51eb 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -41,6 +41,7 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { } VOCAB_NAME = 'vocab.json' MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' def get_pairs(word): """ @@ -89,6 +90,11 @@ class OpenAIGPTTokenizer(object): else: vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info("loading special tokens file {}".format(special_tokens_file)) # redirect to the cache, if necessary try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) @@ -117,7 +123,11 @@ class OpenAIGPTTokenizer(object): max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) # Instantiate tokenizer. 
- tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs) + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs) return tokenizer def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None): @@ -269,7 +279,11 @@ class OpenAIGPTTokenizer(object): return vocab_file = os.path.join(vocab_path, VOCAB_NAME) merge_file = os.path.join(vocab_path, MERGES_NAME) - json.dump(self.encoder, vocab_file) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write(u'#version: 0.2\n') @@ -278,6 +292,11 @@ class OpenAIGPTTokenizer(object): logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." 
" Please check that the tokenizer is not corrupted!".format(merge_file)) index = token_index - writer.write(bpe_tokens + u'\n') + writer.write(' '.join(bpe_tokens) + u'\n') index += 1 - return vocab_file, merge_file + + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token in sorted(self.special_tokens.keys(), key=lambda kv: kv[1]): + writer.write(token + u'\n') + + return vocab_file, merge_file, special_tokens_file diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py index 2b1bdd3a9a..1f695cfb12 100644 --- a/tests/tokenization_openai_test.py +++ b/tests/tokenization_openai_test.py @@ -52,7 +52,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase): self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - vocab_file, merges_file = tokenizer.save_vocabulary(vocab_path="/tmp/") + vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/") tokenizer.from_pretrained("/tmp/") os.remove(vocab_file) os.remove(merges_file) diff --git a/tests/tokenization_transfo_xl_test.py b/tests/tokenization_transfo_xl_test.py index add2eb4e71..1a805f11e6 100644 --- a/tests/tokenization_transfo_xl_test.py +++ b/tests/tokenization_transfo_xl_test.py @@ -35,7 +35,7 @@ class TransfoXLTokenizationTest(unittest.TestCase): tokenizer.build_vocab() os.remove(vocab_file) - tokens = tokenizer.tokenize(u" UNwant\u00E9d,running") + tokens = tokenizer.tokenize(u" UNwanted , running") self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) self.assertListEqual( @@ -45,7 +45,7 @@ class TransfoXLTokenizationTest(unittest.TestCase): tokenizer.from_pretrained(vocab_file) os.remove(vocab_file) - tokens = tokenizer.tokenize(u" UNwant\u00E9d,running") + tokens = tokenizer.tokenize(u" UNwanted , running") self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) self.assertListEqual( @@ -56,15 +56,14 @@ class TransfoXLTokenizationTest(unittest.TestCase): tokenizer = 
TransfoXLTokenizer(lower_case=True) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), ["hello", "!", "how", "are", "you", "?"]) - self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) def test_full_tokenizer_no_lower(self): tokenizer = TransfoXLTokenizer(lower_case=False) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]) From b17963d82ffa1355d222d3377594e61a25acd7aa Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 13:44:30 +0200 Subject: [PATCH 21/47] update readme --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index daac69de9f..1e192941f0 100644 --- a/README.md +++ b/README.md @@ -796,8 +796,7 @@ This model *outputs*: - `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices] - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example). - -### Tokenizers: +### Tokenizers #### `BertTokenizer` @@ -816,6 +815,7 @@ and three methods: - `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization. - `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary. - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary. +- `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: `vocab_file_path`. The vocabulary can be reloaded with `BertTokenizer.from_pretrained('vocab_file_path')` or `BertTokenizer.from_pretrained('directory_path')`. 
Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing. @@ -837,6 +837,7 @@ and five methods: - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary. - `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments) - `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces. +- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`. Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`. @@ -844,6 +845,8 @@ Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch `TransfoXLTokenizer` perform word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by toekn frequency (for adaptive softmax). See the adaptive softmax paper ([Efficient softmax approximation for GPUs](http://arxiv.org/abs/1609.04309)) for more details. +The API is similar to the API of `BertTokenizer` (see above). + Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`. 
#### `GPT2Tokenizer` @@ -860,11 +863,11 @@ and two methods: - `encode(text)`: convert a `str` in a list of `int` tokens by performing byte-level BPE. - `decode(tokens)`: convert back a list of `int` tokens in a `str`. +- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`. Please refer to [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`. - -### Optimizers: +### Optimizers #### `BertAdam` From 9761aa48452712711d6b2ff04902b8a37ff294b3 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 14:12:08 +0200 Subject: [PATCH 22/47] add to_json_file method to configuration classes --- pytorch_pretrained_bert/modeling.py | 5 +++++ pytorch_pretrained_bert/modeling_gpt2.py | 5 +++++ pytorch_pretrained_bert/modeling_openai.py | 5 +++++ pytorch_pretrained_bert/modeling_transfo_xl.py | 5 +++++ tests/modeling_gpt2_test.py | 9 +++++++++ tests/modeling_openai_test.py | 9 +++++++++ tests/modeling_test.py | 9 +++++++++ tests/modeling_transfo_xl_test.py | 9 +++++++++ 8 files changed, 56 insertions(+) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 2736e34d7f..6a71cbeea6 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -220,6 +220,11 @@ class BertConfig(object): """Serializes this instance to a JSON string.""" return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + try: from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm except ImportError: diff --git 
a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 7b00ce7730..fce564e9ea 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -180,6 +180,11 @@ class GPT2Config(object): """Serializes this instance to a JSON string.""" return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + class Conv1D(nn.Module): def __init__(self, nf, nx): diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index b6252d097f..33bb4472a5 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -225,6 +225,11 @@ class OpenAIGPTConfig(object): """Serializes this instance to a JSON string.""" return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + class Conv1D(nn.Module): def __init__(self, nf, rf, nx): diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py index ac895a03a7..0ba986f5b4 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -316,6 +316,11 @@ class TransfoXLConfig(object): """Serializes this instance to a JSON string.""" return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + class PositionalEmbedding(nn.Module): def __init__(self, demb): diff --git a/tests/modeling_gpt2_test.py 
b/tests/modeling_gpt2_test.py index 12a539c44b..d542422060 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -16,6 +16,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os import unittest import json import random @@ -176,6 +177,14 @@ class GPT2ModelTest(unittest.TestCase): self.assertEqual(obj["vocab_size"], 99) self.assertEqual(obj["n_embd"], 37) + def test_config_to_json_file(self): + config_first = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37) + json_file_path = "/tmp/config.json" + config_first.to_json_file(json_file_path) + config_second = GPT2Config.from_json_file(json_file_path) + os.remove(json_file_path) + self.assertEqual(config_second.to_dict(), config_first.to_dict()) + def run_tester(self, tester): config_and_inputs = tester.prepare_config_and_inputs() output_result = tester.create_gpt2_model(*config_and_inputs) diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py index 1cc8b7d5dc..db03bf792e 100644 --- a/tests/modeling_openai_test.py +++ b/tests/modeling_openai_test.py @@ -16,6 +16,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os import unittest import json import random @@ -188,6 +189,14 @@ class OpenAIGPTModelTest(unittest.TestCase): self.assertEqual(obj["vocab_size"], 99) self.assertEqual(obj["n_embd"], 37) + def test_config_to_json_file(self): + config_first = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37) + json_file_path = "/tmp/config.json" + config_first.to_json_file(json_file_path) + config_second = OpenAIGPTConfig.from_json_file(json_file_path) + os.remove(json_file_path) + self.assertEqual(config_second.to_dict(), config_first.to_dict()) + def run_tester(self, tester): config_and_inputs = tester.prepare_config_and_inputs() output_result = tester.create_openai_model(*config_and_inputs) diff --git a/tests/modeling_test.py 
b/tests/modeling_test.py index c7a031cfb0..02d7a13fda 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -16,6 +16,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os import unittest import json import random @@ -251,6 +252,14 @@ class BertModelTest(unittest.TestCase): self.assertEqual(obj["vocab_size"], 99) self.assertEqual(obj["hidden_size"], 37) + def test_config_to_json_file(self): + config_first = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37) + json_file_path = "/tmp/config.json" + config_first.to_json_file(json_file_path) + config_second = BertConfig.from_json_file(json_file_path) + os.remove(json_file_path) + self.assertEqual(config_second.to_dict(), config_first.to_dict()) + def run_tester(self, tester): config_and_inputs = tester.prepare_config_and_inputs() output_result = tester.create_bert_model(*config_and_inputs) diff --git a/tests/modeling_transfo_xl_test.py b/tests/modeling_transfo_xl_test.py index 291d5d9d2a..a59d90b205 100644 --- a/tests/modeling_transfo_xl_test.py +++ b/tests/modeling_transfo_xl_test.py @@ -16,6 +16,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os import unittest import json import random @@ -186,6 +187,14 @@ class TransfoXLModelTest(unittest.TestCase): self.assertEqual(obj["n_token"], 96) self.assertEqual(obj["d_embed"], 37) + def test_config_to_json_file(self): + config_first = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37) + json_file_path = "/tmp/config.json" + config_first.to_json_file(json_file_path) + config_second = TransfoXLConfig.from_json_file(json_file_path) + os.remove(json_file_path) + self.assertEqual(config_second.to_dict(), config_first.to_dict()) + def run_tester(self, tester): config_and_inputs = tester.prepare_config_and_inputs() From 20577d8a7cb7dd38d3c5295c6f44bf377435e608 Mon Sep 17 00:00:00 2001 From: thomwolf 
Date: Mon, 15 Apr 2019 14:21:41 +0200 Subject: [PATCH 23/47] add configuration serialization to readme --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 1e192941f0..2a59bb0d37 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,7 @@ This package comprises the following classes that can be imported in Python and - Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files): - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files. - `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files. + - `GPT2Config` - Configuration class to store the configuration of a `GPT2Model` with utilities to read and write from JSON configuration files. - `TransfoXLConfig` - Configuration class to store the configuration of a `TransfoXLModel` with utilities to read and write from JSON configuration files. The repository further comprises: @@ -524,6 +525,23 @@ model = GPT2Model.from_pretrained('gpt2') ``` +### Configuration classes + +Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are: + +- `BertConfig` for `BertModel` and BERT classes instances. +- `OpenAIGPTConfig` for `OpenAIGPTModel` and OpenAI GPT classes instances. +- `GPT2Config` for `GPT2Model` and OpenAI GPT-2 classes instances. +- `TransfoXLConfig` for `TransfoXLModel` and Transformer-XL classes instances. 
+ +These configuration classes contains a few utilities to load and save configurations: + +- `from_dict(cls, json_object)`: A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class. +- `from_json_file(cls, json_file)`: A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class. +- `to_dict()`: Serializes an instance to a Python dictionary. Returns a dictionary. +- `to_json_string()`: Serializes an instance to a JSON string. Returns a string. +- `to_json_file(json_file_path)`: Save an instance to a json file. + ### PyTorch models #### 1. `BertModel` From b3c6ee0ac1cd95bcd0a54a36a29daf599f389f93 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 14:24:52 +0200 Subject: [PATCH 24/47] tokenization updates --- pytorch_pretrained_bert/tokenization.py | 5 +++-- pytorch_pretrained_bert/tokenization_transfo_xl.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 8fd65f55f0..3937d6e011 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -135,9 +135,10 @@ class BertTokenizer(object): return tokens def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a path.""" + """Save the tokenizer vocabulary to a directory or file.""" index = 0 - vocab_file = os.path.join(vocab_path, VOCAB_NAME) + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_NAME) with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py index f704a035db..ddebc57c10 100644 --- a/pytorch_pretrained_bert/tokenization_transfo_xl.py +++ 
b/pytorch_pretrained_bert/tokenization_transfo_xl.py @@ -145,8 +145,10 @@ class TransfoXLTokenizer(object): raise ValueError('No token in vocabulary') def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" index = 0 - vocab_file = os.path.join(vocab_path, VOCAB_NAME) + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_NAME) torch.save(self.__dict__, vocab_file) return vocab_file From 179a2c2ff66ebf147c562243d7f4b6f37c0cdd23 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 14:33:23 +0200 Subject: [PATCH 25/47] update example to work with new serialization semantic --- examples/run_classifier.py | 25 ++++++++++++++----------- examples/run_openai_gpt.py | 20 +++++++++++++------- examples/run_squad.py | 25 ++++++++++++++----------- examples/run_swag.py | 25 ++++++++++++++----------- 4 files changed, 55 insertions(+), 40 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 751d581ad9..ba49d18b8d 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -37,7 +37,7 @@ from sklearn.metrics import matthews_corrcoef, f1_score from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME -from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.tokenization import BertTokenizer, VOCAB_NAME from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -857,18 +857,21 @@ def main(): optimizer.zero_grad() global_step += 1 - # Save a trained model and the associated configuration + # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = os.path.join(args.output_dir, 
WEIGHTS_NAME) - torch.save(model_to_save.state_dict(), output_model_file) - output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - with open(output_config_file, 'w') as f: - f.write(model_to_save.config.to_json_string()) - # Load a trained model and config that you have fine-tuned - config = BertConfig(output_config_file) - model = BertForSequenceClassification(config, num_labels=num_labels) - model.load_state_dict(torch.load(output_model_file)) + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(output_vocab_file) + + # Load a trained model and vocabulary that you have fine-tuned + model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) + tokenizer = BertTokenizer.from_pretrained(args.output_dir) else: model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) model.to(device) diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py index ee30a7a0a4..1686c9eda6 100644 --- a/examples/run_openai_gpt.py +++ b/examples/run_openai_gpt.py @@ -40,6 +40,7 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path +from pytorch_pretrained_bert.modeling_openai import WEIGHTS_NAME, CONFIG_NAME ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz" @@ -218,15 +219,20 @@ def main(): # Save a trained model if args.do_train: + # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only 
save the model it-self - output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") - config = model.config - torch.save(model_to_save.state_dict(), output_model_file) - # Load a trained model that you have fine-tuned - model_state_dict = torch.load(output_model_file) - model = OpenAIGPTDoubleHeadsModel(config) - model.load_state_dict(model_state_dict) + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) + + # Load a trained model and vocabulary that you have fine-tuned + model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir) + tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir) model.to(device) if args.do_eval: diff --git a/examples/run_squad.py b/examples/run_squad.py index 043b795326..045c0afe1e 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -39,7 +39,7 @@ from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfi from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear from pytorch_pretrained_bert.tokenization import (BasicTokenizer, BertTokenizer, - whitespace_tokenize) + whitespace_tokenize, VOCAB_NAME) if sys.version_info[0] == 2: import cPickle as pickle @@ -1009,18 +1009,21 @@ def main(): global_step += 1 if args.do_train: - # Save a trained model and the associated configuration + # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) - torch.save(model_to_save.state_dict(), output_model_file) - output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - with open(output_config_file, 
'w') as f: - f.write(model_to_save.config.to_json_string()) - # Load a trained model and config that you have fine-tuned - config = BertConfig(output_config_file) - model = BertForQuestionAnswering(config) - model.load_state_dict(torch.load(output_model_file)) + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(output_vocab_file) + + # Load a trained model and vocabulary that you have fine-tuned + model = BertForQuestionAnswering.from_pretrained(args.output_dir) + tokenizer = BertTokenizer.from_pretrained(args.output_dir) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) diff --git a/examples/run_swag.py b/examples/run_swag.py index f193582640..fa145c29d7 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -35,7 +35,7 @@ from tqdm import tqdm, trange from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE from pytorch_pretrained_bert.modeling import (BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME) from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear -from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.tokenization import BertTokenizer, VOCAB_NAME logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', @@ -473,18 +473,21 @@ def main(): if args.do_train: - # Save a trained model and the associated configuration + # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = os.path.join(args.output_dir, 
WEIGHTS_NAME) - torch.save(model_to_save.state_dict(), output_model_file) - output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - with open(output_config_file, 'w') as f: - f.write(model_to_save.config.to_json_string()) - # Load a trained model and config that you have fine-tuned - config = BertConfig(output_config_file) - model = BertForMultipleChoice(config, num_choices=4) - model.load_state_dict(torch.load(output_model_file)) + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(output_vocab_file) + + # Load a trained model and vocabulary that you have fine-tuned + model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) + tokenizer = BertTokenizer.from_pretrained(args.output_dir) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) model.to(device) From 60ea6c59d24f63681e120e704d2f823bfcc2c04e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 15:00:33 +0200 Subject: [PATCH 26/47] added best practices for serialization in README and examples --- README.md | 76 +++++++++++++++++++ examples/run_classifier.py | 11 ++- examples/run_openai_gpt.py | 4 +- examples/run_squad.py | 11 ++- examples/run_swag.py | 11 ++- pytorch_pretrained_bert/__init__.py | 2 +- pytorch_pretrained_bert/file_utils.py | 3 + pytorch_pretrained_bert/modeling.py | 8 +- pytorch_pretrained_bert/modeling_gpt2.py | 5 +- pytorch_pretrained_bert/modeling_openai.py | 4 +- .../modeling_transfo_xl.py | 5 +- 11 files changed, 106 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 2a59bb0d37..2f725f1786 100644 --- a/README.md +++ b/README.md @@ 
-525,6 +525,82 @@ model = GPT2Model.from_pretrained('gpt2') ``` +### Serialization best-practices: saving and re-loading a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL) + +There are three types of files you need to save to be able to reload a fine-tuned model: + +- the model it-self which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices), +- the configuration file of the model which is saved as a JSON file, and +- the vocabulary (and the merges for the BPE-based models GPT and GPT-2). + +Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards: + +```python +from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME + +output_dir = "./models/" + +# Step 1: Save a model, configuration and vocabulary that you have fine-tuned + +# If we have a distributed model, save only the encapsulated model +# (it was wrapped in PyTorch DistributedDataParallel or DataParallel) +model_to_save = model.module if hasattr(model, 'module') else model + +# If we save using the predefined names, we can load using `from_pretrained` +output_model_file = os.path.join(output_dir, WEIGHTS_NAME) +output_config_file = os.path.join(output_dir, CONFIG_NAME) + +torch.save(model_to_save.state_dict(), output_model_file) +model_to_save.config.to_json_file(output_config_file) +tokenizer.save_vocabulary(output_dir) + +# Step 2: Re-load the saved model and vocabulary + +# Example for a Bert model +model = BertForQuestionAnswering.from_pretrained(output_dir) +tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case) # Add specific options if needed +# Example for a GPT model +model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir) +tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir) +``` + +Here is another way you can save and reload the model if you want to use 
specific paths for each type of files: + +```python +output_model_file = "./models/my_own_model_file.bin" +output_config_file = "./models/my_own_config_file.bin" +output_vocab_file = "./models/my_own_vocab_file.bin" + +# Step 1: Save a model, configuration and vocabulary that you have fine-tuned + +# If we have a distributed model, save only the encapsulated model +# (it was wrapped in PyTorch DistributedDataParallel or DataParallel) +model_to_save = model.module if hasattr(model, 'module') else model + +torch.save(model_to_save.state_dict(), output_model_file) +model_to_save.config.to_json_file(output_config_file) +tokenizer.save_vocabulary(output_vocab_file) + +# Step 2: Re-load the saved model and vocabulary + +# We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`. +# Here is how to do it in this situation: + +# Example for a Bert model +config = BertConfig.from_json_file(output_config_file) +model = BertForQuestionAnswering(config) +state_dict = torch.load(output_model_file) +model.load_state_dict(state_dict) +tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case) + +# Example for a GPT model +config = OpenAIGPTConfig.from_json_file(output_config_file) +model = OpenAIGPTDoubleHeadsModel(config) +state_dict = torch.load(output_model_file) +model.load_state_dict(state_dict) +tokenizer = OpenAIGPTTokenizer(output_vocab_file) +``` + ### Configuration classes Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. 
The respective configuration classes are: diff --git a/examples/run_classifier.py b/examples/run_classifier.py index ba49d18b8d..46a428b3b8 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -35,9 +35,9 @@ from torch.nn import CrossEntropyLoss, MSELoss from scipy.stats import pearsonr, spearmanr from sklearn.metrics import matthews_corrcoef, f1_score -from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE -from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME -from pytorch_pretrained_bert.tokenization import BertTokenizer, VOCAB_NAME +from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig +from pytorch_pretrained_bert.tokenization import BertTokenizer from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -863,15 +863,14 @@ def main(): # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(output_vocab_file) + tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) - tokenizer = BertTokenizer.from_pretrained(args.output_dir) + tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) model.to(device) 
diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py index 1686c9eda6..cb5aa8d9cb 100644 --- a/examples/run_openai_gpt.py +++ b/examples/run_openai_gpt.py @@ -39,8 +39,8 @@ import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) -from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path -from pytorch_pretrained_bert.modeling_openai import WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, + OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME) ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz" diff --git a/examples/run_squad.py b/examples/run_squad.py index 045c0afe1e..14e6bd7ab8 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -34,12 +34,12 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE -from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear from pytorch_pretrained_bert.tokenization import (BasicTokenizer, BertTokenizer, - whitespace_tokenize, VOCAB_NAME) + whitespace_tokenize) if sys.version_info[0] == 2: import cPickle as pickle @@ -1015,15 +1015,14 @@ def main(): # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME) 
torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(output_vocab_file) + tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) - tokenizer = BertTokenizer.from_pretrained(args.output_dir) + tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) diff --git a/examples/run_swag.py b/examples/run_swag.py index fa145c29d7..a6cfdbe311 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -32,10 +32,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE -from pytorch_pretrained_bert.modeling import (BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME) +from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear -from pytorch_pretrained_bert.tokenization import BertTokenizer, VOCAB_NAME +from pytorch_pretrained_bert.tokenization import BertTokenizer logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', @@ -479,15 +479,14 @@ def main(): # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME) torch.save(model_to_save.state_dict(), output_model_file) 
model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(output_vocab_file) + tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) - tokenizer = BertTokenizer.from_pretrained(args.output_dir) + tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) model.to(device) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index bd455b8d9c..28d215d8bd 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -21,4 +21,4 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model, from .optimization import BertAdam from .optimization_openai import OpenAIAdam -from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path +from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 8601edde23..6de7e259e5 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -33,6 +33,9 @@ except (AttributeError, ImportError): PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) +CONFIG_NAME = "config.json" +WEIGHTS_NAME = "pytorch_model.bin" + logger = logging.getLogger(__name__) # pylint: disable=invalid-name diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 6a71cbeea6..dca6ac53f2 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -32,7 +32,7 @@ import torch from torch import nn from torch.nn import CrossEntropyLoss -from .file_utils import cached_path +from .file_utils import cached_path, WEIGHTS_NAME, 
CONFIG_NAME logger = logging.getLogger(__name__) @@ -45,8 +45,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", } -CONFIG_NAME = 'bert_config.json' -WEIGHTS_NAME = 'pytorch_model.bin' +BERT_CONFIG_NAME = 'bert_config.json' TF_WEIGHTS_NAME = 'model.ckpt' def load_tf_weights_in_bert(model, tf_checkpoint_path): @@ -586,6 +585,9 @@ class BertPreTrainedModel(nn.Module): serialization_dir = tempdir # Load config config_file = os.path.join(serialization_dir, CONFIG_NAME) + if not os.path.exists(config_file): + # Backward compatibility with old naming format + config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME) config = BertConfig.from_json_file(config_file) logger.info("Model config {}".format(config)) # Instantiate model. diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index fce564e9ea..e6017d33e4 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -34,7 +34,7 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .file_utils import cached_path +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME from .modeling import BertLayerNorm as LayerNorm logger = logging.getLogger(__name__) @@ -42,9 +42,6 @@ logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"} PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"} -CONFIG_NAME = "config.json" -WEIGHTS_NAME = "pytorch_model.bin" - def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model """ diff --git a/pytorch_pretrained_bert/modeling_openai.py 
b/pytorch_pretrained_bert/modeling_openai.py index 33bb4472a5..57a7921d7a 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -34,7 +34,7 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss from torch.nn.parameter import Parameter -from .file_utils import cached_path +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME from .modeling import BertLayerNorm as LayerNorm logger = logging.getLogger(__name__) @@ -42,8 +42,6 @@ logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"} -CONFIG_NAME = "config.json" -WEIGHTS_NAME = "pytorch_model.bin" def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path): """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here) diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py index 0ba986f5b4..0b732cdef1 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -40,7 +40,7 @@ from torch.nn.parameter import Parameter from .modeling import BertLayerNorm as LayerNorm from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits -from .file_utils import cached_path +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME logger = logging.getLogger(__name__) @@ -50,8 +50,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = { PRETRAINED_CONFIG_ARCHIVE_MAP = { 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", } -CONFIG_NAME = 'config.json' -WEIGHTS_NAME = 'pytorch_model.bin' + TF_WEIGHTS_NAME = 'model.ckpt' def build_tf_to_pytorch_map(model, config): From cc433070238d8e3c093b12cb2b9ba34028adce93 Mon Sep 17 00:00:00 2001 
From: thomwolf Date: Mon, 15 Apr 2019 15:06:10 +0200 Subject: [PATCH 27/47] update readme --- README.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2f725f1786..0a61992efc 100644 --- a/README.md +++ b/README.md @@ -462,10 +462,12 @@ Here is a detailed documentation of the classes in the package and how to use th | Sub-section | Description | |-|-| -| [Loading Google AI's/OpenAI's pre-trained weights](#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump) | How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance | -| [PyTorch models](#PyTorch-models) | API of the BERT, GPT, GPT-2 and Transformer-XL PyTorch model classes | +| [Loading pre-trained weights](#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump) | How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance | +| [Serialization best-practices](#serialization-best-practices) | How to save and reload a fine-tuned model | +| [Configurations](#configurations) | API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL | +| [Models](#models) | API of the PyTorch model classes for BERT, GPT, GPT-2 and Transformer-XL | | [Tokenizers](#tokenizers) | API of the tokenizers class for BERT, GPT, GPT-2 and Transformer-XL| -| [Optimizers](#optimizerss) | API of the optimizers | +| [Optimizers](#optimizers) | API of the optimizers | ### Loading Google AI or OpenAI pre-trained weights or PyTorch dump @@ -525,8 +527,9 @@ model = GPT2Model.from_pretrained('gpt2') ``` -### Serialization best-practices: saving and re-loading a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL) +### Serialization best-practices +This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
There are three types of files you need to save to be able to reload a fine-tuned model: - the model it-self which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices), @@ -601,7 +604,7 @@ model.load_state_dict(state_dict) tokenizer = OpenAIGPTTokenizer(output_vocab_file) ``` -### Configuration classes +### Configurations Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are: @@ -618,7 +621,7 @@ These configuration classes contains a few utilities to load and save configurat - `to_json_string()`: Serializes an instance to a JSON string. Returns a string. - `to_json_file(json_file_path)`: Save an instance to a json file. -### PyTorch models +### Models #### 1. `BertModel` From 1135f2384ab735759e8c1a0643dba938e4e609ea Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 15:22:40 +0200 Subject: [PATCH 28/47] clean up logger in examples for distributed case --- README.md | 16 ++++++++++------ examples/run_classifier.py | 8 +++++--- examples/run_squad.py | 8 +++++--- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0a61992efc..caf415508f 100644 --- a/README.md +++ b/README.md @@ -1274,18 +1274,20 @@ To get these results we used a combination of: Here is the full list of hyper-parameters for this run: ```bash +export SQUAD_DIR=/path/to/SQUAD + python ./run_squad.py \ --bert_model bert-large-uncased \ --do_train \ --do_predict \ --do_lower_case \ - --train_file $SQUAD_TRAIN \ - --predict_file $SQUAD_EVAL \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ --learning_rate 3e-5 \ --num_train_epochs 2 \ --max_seq_length 384 \ --doc_stride 128 \ - --output_dir $OUTPUT_DIR 
\ + --output_dir /tmp/debug_squad/ \ --train_batch_size 24 \ --gradient_accumulation_steps 2 ``` @@ -1294,18 +1296,20 @@ If you have a recent GPU (starting from NVIDIA Volta series), you should try **1 Here is an example of hyper-parameters for a FP16 run we tried: ```bash +export SQUAD_DIR=/path/to/SQUAD + python ./run_squad.py \ --bert_model bert-large-uncased \ --do_train \ --do_predict \ --do_lower_case \ - --train_file $SQUAD_TRAIN \ - --predict_file $SQUAD_EVAL \ + --train_file $SQUAD_DIR/train-v1.1.json \ + --predict_file $SQUAD_DIR/dev-v1.1.json \ --learning_rate 3e-5 \ --num_train_epochs 2 \ --max_seq_length 384 \ --doc_stride 128 \ - --output_dir $OUTPUT_DIR \ + --output_dir /tmp/debug_squad/ \ --train_batch_size 24 \ --fp16 \ --loss_scale 128 diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 46a428b3b8..112be6fbcb 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -40,9 +40,6 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification, Bert from pytorch_pretrained_bert.tokenization import BertTokenizer from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) logger = logging.getLogger(__name__) @@ -697,6 +694,11 @@ def main(): n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') + + logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) diff --git a/examples/run_squad.py b/examples/run_squad.py index 14e6bd7ab8..00ee368b14 100644 --- a/examples/run_squad.py 
+++ b/examples/run_squad.py @@ -46,9 +46,6 @@ if sys.version_info[0] == 2: else: import pickle -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) logger = logging.getLogger(__name__) @@ -848,6 +845,11 @@ def main(): n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') + + logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) From 7816f7921fd5a21fdc74ca0f29589c74bceed0e2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 15:27:10 +0200 Subject: [PATCH 29/47] clean up distributed training logging in run_squad example --- examples/run_squad.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 00ee368b14..bad46203bc 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -985,7 +985,7 @@ def main(): model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): - for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch @@ -1058,7 +1058,7 @@ def main(): model.eval() all_results = [] logger.info("Start evaluating") - for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"): + for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, 
desc="Evaluating", disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) From 2499b0a5fcdb168ccb0095e837b2022953935af2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 15:33:04 +0200 Subject: [PATCH 30/47] add ptvsd to run_squad --- examples/run_squad.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/run_squad.py b/examples/run_squad.py index bad46203bc..cd85219f5f 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -834,7 +834,17 @@ def main(): parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") + parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() + print(args) + + if args.server_ip and args.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + print("Waiting for debugger attach") + ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) + ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") From df5d9c3551a6405feb697a1cad903dddffa04bfe Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 15:43:01 +0200 Subject: [PATCH 31/47] load all models on cpu --- pytorch_pretrained_bert/modeling.py | 2 +- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- pytorch_pretrained_bert/modeling_openai.py | 2 +- pytorch_pretrained_bert/modeling_transfo_xl.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 
dca6ac53f2..8dfb5fe51e 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -594,7 +594,7 @@ class BertPreTrainedModel(nn.Module): model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None) + state_dict = torch.load(weights_path, map_location='cpu') if tempdir: # Clean up temp dir shutil.rmtree(tempdir) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index e6017d33e4..7cf1e6b59d 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -418,7 +418,7 @@ class GPT2PreTrainedModel(nn.Module): # Instantiate model. model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: - state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None) + state_dict = torch.load(resolved_archive_file, map_location='cpu') if from_tf: # Directly load from a TensorFlow checkpoint (stored as NumPy array) return load_tf_weights_in_gpt2(model, resolved_archive_file) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 57a7921d7a..3dedc53f11 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -476,7 +476,7 @@ class OpenAIGPTPreTrainedModel(nn.Module): # Instantiate model. 
model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: - state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None) + state_dict = torch.load(resolved_archive_file, map_location='cpu') if from_tf: # Directly load from a TensorFlow checkpoint (stored as NumPy array) return load_tf_weights_in_openai_gpt(model, resolved_archive_file) diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py index 0b732cdef1..e8fffc5b60 100644 --- a/pytorch_pretrained_bert/modeling_transfo_xl.py +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -944,7 +944,7 @@ class TransfoXLPreTrainedModel(nn.Module): # Instantiate model. model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: - state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None) + state_dict = torch.load(resolved_archive_file, map_location='cpu') if from_tf: # Directly load from a TensorFlow checkpoint return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path) From d61602245566b1e42dca9238b3b8a0f23f3fdad1 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 16:07:45 +0200 Subject: [PATCH 32/47] fix openai special tokens loading --- pytorch_pretrained_bert/tokenization_openai.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index d9713e51eb..7a10271175 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -87,6 +87,7 @@ class OpenAIGPTTokenizer(object): if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None else: vocab_file = 
os.path.join(pretrained_model_name_or_path, VOCAB_NAME) merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) From 3571187ef6f07a7ba63ee5b355e312f2fbfaaab7 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 15 Apr 2019 16:43:56 +0200 Subject: [PATCH 33/47] fix saving models in distributed setting examples --- examples/run_classifier.py | 1 + examples/run_squad.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 112be6fbcb..4994118467 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -859,6 +859,7 @@ def main(): optimizer.zero_grad() global_step += 1 + if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self diff --git a/examples/run_squad.py b/examples/run_squad.py index cd85219f5f..410fd85298 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -1020,7 +1020,7 @@ def main(): optimizer.zero_grad() global_step += 1 - if args.do_train: + if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self From 18a8a15f78a10ac6bf272bc762232b3f16df30e2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 16 Apr 2019 17:00:55 +0200 Subject: [PATCH 34/47] improving GPT2 tokenization and adding tests --- README.md | 7 +- pytorch_pretrained_bert/tokenization_gpt2.py | 96 +++++++++++++++---- .../tokenization_openai.py | 15 ++- tests/tokenization_gpt2_test.py | 68 +++++++++++++ tests/tokenization_openai_test.py | 17 ++-- 5 files changed, 169 insertions(+), 34 deletions(-) create mode 100644 tests/tokenization_gpt2_test.py diff --git a/README.md b/README.md index caf415508f..fde35d23ea 100644 --- a/README.md 
+++ b/README.md @@ -929,10 +929,11 @@ This class has four arguments: and five methods: -- `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization. +- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing BPE tokenization. - `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary. - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary. - `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments) +- `encode(text)`: convert a `str` in a list of `int` tokens by performing BPE encoding. - `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces. - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`. @@ -958,6 +959,10 @@ This class has three arguments: and two methods: +- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing byte-level BPE. +- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary. +- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary. +- `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments) - `encode(text)`: convert a `str` in a list of `int` tokens by performing byte-level BPE. - `decode(tokens)`: convert back a list of `int` tokens in a `str`. 
- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`. diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index ab80876ee5..491db616e4 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -16,6 +16,7 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) +import sys import json import logging import os @@ -138,7 +139,7 @@ class GPT2Tokenizer(object): tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs) return tokenizer - def __init__(self, vocab_file, merges_file, errors='replace', max_len=None): + def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None): self.max_len = max_len if max_len is not None else int(1e12) self.encoder = json.load(open(vocab_file)) self.decoder = {v:k for k,v in self.encoder.items()} @@ -153,8 +154,25 @@ class GPT2Tokenizer(object): # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + def __len__(self): - return len(self.encoder) + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()} + logger.info("Special tokens {}".format(self.special_tokens)) def bpe(self, token): if token in self.cache: @@ -197,6 +215,54 @@ class GPT2Tokenizer(object): self.cache[token] = word return word + def tokenize(self, text): + """ Tokenize a string. """ + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). 
Running this" + " sequence through the model will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + def save_vocabulary(self, vocab_path): """Save the tokenizer vocabulary and merge files to a directory.""" if not os.path.isdir(vocab_path): @@ -220,26 +286,14 @@ class GPT2Tokenizer(object): writer.write(' '.join(bpe_tokens) + u'\n') index += 1 + index = len(self.encoder) with open(special_tokens_file, 'w', encoding='utf-8') as writer: - for token in sorted(self.special_tokens.keys(), key=lambda kv: kv[1]): + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index writer.write(token + u'\n') + index += 1 return vocab_file, merge_file, special_tokens_file - - def encode(self, text): - bpe_tokens = [] - for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) - bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) - if len(bpe_tokens) > self.max_len: - logger.warning( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this OpenAI GPT-2 model ({} > {}). 
Running this" - " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len) - ) - return bpe_tokens - - def decode(self, tokens): - text = ''.join([self.decoder[token] for token in tokens]) - text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) - return text diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index 7a10271175..1088b5222b 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -150,6 +150,8 @@ class OpenAIGPTTokenizer(object): merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + self.special_tokens = {} + self.special_tokens_decoder = {} self.set_special_tokens(special_tokens) def __len__(self): @@ -261,7 +263,10 @@ class OpenAIGPTTokenizer(object): tokens.append(self.decoder[i]) return tokens - def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False): + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): """Converts a sequence of ids in a string.""" tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens) out_string = ''.join(tokens).replace('', ' ').strip() @@ -296,8 +301,14 @@ class OpenAIGPTTokenizer(object): writer.write(' '.join(bpe_tokens) + u'\n') index += 1 + index = len(self.encoder) with open(special_tokens_file, 'w', encoding='utf-8') as writer: - for token in sorted(self.special_tokens.keys(), key=lambda kv: kv[1]): + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." 
+ " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index writer.write(token + u'\n') + index += 1 return vocab_file, merge_file, special_tokens_file diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py new file mode 100644 index 0000000000..29633bc17c --- /dev/null +++ b/tests/tokenization_gpt2_test.py @@ -0,0 +1,68 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +import json + +from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer + + +class GPT2TokenizationTest(unittest.TestCase): + + def test_full_tokenizer(self): + """ Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt """ + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", + "lo", "low", "er", + "low", "lowest", "newer", "wider"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp: + json.dump(vocab_tokens, fp) + vocab_file = fp.name + with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp: + fp.write("\n".join(merges)) + merges_file = fp.name + + tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["", ""]) + os.remove(vocab_file) + os.remove(merges_file) + + text = "lower" + bpe_tokens = ["low", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [""] + input_bpe_tokens = [13, 12, 16] + self.assertListEqual( + tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/") + tokenizer_2 = GPT2Tokenizer.from_pretrained("/tmp/") + os.remove(vocab_file) + os.remove(merges_file) + os.remove(special_tokens_file) + + self.assertListEqual( + [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks, + tokenizer.special_tokens, tokenizer.special_tokens_decoder], + [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks, + tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py index 1f695cfb12..fb42cdd8cb 100644 --- a/tests/tokenization_openai_test.py +++ b/tests/tokenization_openai_test.py @@ -38,7 +38,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase): fp.write("\n".join(merges)) merges_file = fp.name - tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=[""]) + tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["", ""]) 
os.remove(vocab_file) os.remove(merges_file) @@ -53,19 +53,16 @@ class OpenAIGPTTokenizationTest(unittest.TestCase): tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/") - tokenizer.from_pretrained("/tmp/") + tokenizer_2 = OpenAIGPTTokenizer.from_pretrained("/tmp/") os.remove(vocab_file) os.remove(merges_file) + os.remove(special_tokens_file) - text = "lower" - bpe_tokens = ["low", "er"] - tokens = tokenizer.tokenize(text) - self.assertListEqual(tokens, bpe_tokens) - - input_tokens = tokens + [""] - input_bpe_tokens = [14, 15, 20] self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks, + tokenizer.special_tokens, tokenizer.special_tokens_decoder], + [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks, + tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder]) if __name__ == '__main__': From bdaba1897c14e0243d7fb58ddf5061957c70eea6 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 16 Apr 2019 17:44:06 +0200 Subject: [PATCH 35/47] updating GPT tokenization --- pytorch_pretrained_bert/tokenization_openai.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index 1088b5222b..214a476ce9 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -273,9 +273,8 @@ class OpenAIGPTTokenizer(object): if clean_up_tokenization_spaces: out_string = out_string.replace('', '') out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ',' - ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't" - ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m " - ).replace(" 
've", "'ve") + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") return out_string def save_vocabulary(self, vocab_path): From 07154dadb4fc5ce47e3d82dd33debb8e588039bd Mon Sep 17 00:00:00 2001 From: Abhi Sharma <18308855+SudoSharma@users.noreply.github.com> Date: Tue, 16 Apr 2019 11:11:49 -0700 Subject: [PATCH 36/47] Fix indentation for unconditional generation --- examples/run_gpt2.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py index 61fbf9f323..9ac2b31961 100644 --- a/examples/run_gpt2.py +++ b/examples/run_gpt2.py @@ -107,25 +107,25 @@ def run_model(): print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) print("=" * 80) - if args.unconditional: - generated = 0 - for _ in range(args.nsamples // args.batch_size): - out = sample_sequence( - model=model, length=args.length, - context=None, - start_token=enc.encoder['<|endoftext|>'], - batch_size=args.batch_size, - temperature=args.temperature, top_k=args.top_k, device=device - ) - out = out[:,1:].tolist() - for i in range(args.batch_size): - generated += 1 - text = enc.decode(out[i]) - print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) - print(text) - print("=" * 80) - if args.unconditional: - break + if args.unconditional: + generated = 0 + for _ in range(args.nsamples // args.batch_size): + out = sample_sequence( + model=model, length=args.length, + context=None, + start_token=enc.encoder['<|endoftext|>'], + batch_size=args.batch_size, + temperature=args.temperature, top_k=args.top_k, device=device + ) + out = out[:,1:].tolist() + for i in range(args.batch_size): + generated += 1 + text = enc.decode(out[i]) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) + if args.unconditional: + break if __name__ == '__main__': 
run_model() From 9e666aaa297a84f8276cd891cd1a151e5266349e Mon Sep 17 00:00:00 2001 From: Abhi Sharma <18308855+SudoSharma@users.noreply.github.com> Date: Tue, 16 Apr 2019 11:42:34 -0700 Subject: [PATCH 37/47] Fix gradient overflow issue during attention mask This fix is in reference to issue #382. GPT2 can now be trained in mixed precision, which I've confirmed with testing. I also tested unconditional generation on multiple seeds before and after changing 1e10 to 1e4 and there was no difference. Please let me know if there is anything else I can do to make this pull request better. Thanks for all your work! --- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 7cf1e6b59d..063c525d98 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -218,7 +218,7 @@ class Attention(nn.Module): w = w / math.sqrt(v.size(-1)) nd, ns = w.size(-2), w.size(-1) b = self.bias[:, :, ns-nd:ns, :ns] - w = w * b - 1e10 * (1 - b) + w = w * b - 1e4 * (1 - b) w = nn.Softmax(dim=-1)(w) return torch.matmul(w, v) From 87677fcc4dfda7ee9e0b5609344b46d6e3ccd227 Mon Sep 17 00:00:00 2001 From: Ben Mann <8enmann@gmail.com> Date: Tue, 16 Apr 2019 15:23:21 -0700 Subject: [PATCH 38/47] [run_gpt2.py] temperature should be a float, not int --- examples/run_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py index 61fbf9f323..4b081d3a1d 100644 --- a/examples/run_gpt2.py +++ b/examples/run_gpt2.py @@ -58,7 +58,7 @@ def run_model(): parser.add_argument("--nsamples", type=int, default=1) parser.add_argument("--batch_size", type=int, default=-1) parser.add_argument("--length", type=int, default=-1) - parser.add_argument("--temperature", type=int, default=1) + parser.add_argument("--temperature", type=float, default=1.0) parser.add_argument("--top_k", 
type=int, default=0) parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.') args = parser.parse_args() From bc70779bf0dc7a1b59eeb65d106d1116feb4a828 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 10:56:15 +0200 Subject: [PATCH 39/47] fixed GPT-2 tokenization on python 2 --- pytorch_pretrained_bert/file_utils.py | 2 +- pytorch_pretrained_bert/tokenization_gpt2.py | 5 +++-- tests/tokenization_gpt2_test.py | 3 ++- tests/tokenization_openai_test.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 6de7e259e5..6a24b099e1 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -227,7 +227,7 @@ def get_from_cache(url, cache_dir=None): meta = {'url': url, 'etag': etag} meta_path = cache_path + '.json' with open(meta_path, 'w', encoding="utf-8") as meta_file: - json.dump(meta, meta_file) + meta_file.write(json.dumps(meta)) logger.info("removing temp file %s", temp_file.name) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 491db616e4..0e91498f22 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -59,6 +59,7 @@ def bytes_to_unicode(): To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ + _chr = unichr if sys.version_info[0] == 2 else chr bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) cs = bs[:] n = 0 @@ -67,7 +68,7 @@ def bytes_to_unicode(): bs.append(b) cs.append(2**8+n) n += 1 - cs = [chr(n) for n in cs] + cs = [_chr(n) for n in cs] return dict(zip(bs, cs)) def get_pairs(word): @@ -219,7 +220,7 @@ class GPT2Tokenizer(object): """ Tokenize a string. 
""" bpe_tokens = [] for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8')) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py index 29633bc17c..0773574360 100644 --- a/tests/tokenization_gpt2_test.py +++ b/tests/tokenization_gpt2_test.py @@ -31,13 +31,14 @@ class GPT2TokenizationTest(unittest.TestCase): vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r", ""] with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp: - json.dump(vocab_tokens, fp) + fp.write(json.dumps(vocab_tokens)) vocab_file = fp.name with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp: fp.write("\n".join(merges)) merges_file = fp.name tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["", ""]) + print("encoder", tokenizer.byte_encoder) os.remove(vocab_file) os.remove(merges_file) diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py index fb42cdd8cb..2011ccc1df 100644 --- a/tests/tokenization_openai_test.py +++ b/tests/tokenization_openai_test.py @@ -32,7 +32,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase): vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r", ""] with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp: - json.dump(vocab_tokens, fp) + fp.write(json.dumps(vocab_tokens)) vocab_file = fp.name with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp: fp.write("\n".join(merges)) From 5afa497cbfc53c679a9b22997b6312fad57ee2f8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 11:04:41 +0200 Subject: [PATCH 40/47] fix GPT-2 tokenization to work also on python 3... 
--- pytorch_pretrained_bert/tokenization_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 0e91498f22..80be4435df 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -220,7 +220,7 @@ class GPT2Tokenizer(object): """ Tokenize a string. """ bpe_tokens = [] for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8')) + token = ''.join(self.byte_encoder[ord(b)] for b in token) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens From 31d387604c67d738740a9ae9350df0a273802966 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 11:58:27 +0200 Subject: [PATCH 41/47] adding s3 model tests with --runslow --- .circleci/config.yml | 4 ++-- tests/conftest.py | 19 +++++++++++++++++++ tests/modeling_gpt2_test.py | 12 +++++++++++- tests/modeling_openai_test.py | 12 +++++++++++- tests/modeling_test.py | 11 +++++++++++ tests/modeling_transfo_xl_test.py | 12 +++++++++++- tests/tokenization_gpt2_test.py | 11 ++++++++++- tests/tokenization_openai_test.py | 12 +++++++++++- tests/tokenization_test.py | 11 ++++++++++- tests/tokenization_transfo_xl_test.py | 11 ++++++++++- 10 files changed, 106 insertions(+), 9 deletions(-) create mode 100644 tests/conftest.py diff --git a/.circleci/config.yml b/.circleci/config.yml index b57b478030..7296e07ca3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,7 +9,7 @@ jobs: - run: sudo pip install --progress-bar off . 
- run: sudo pip install pytest ftfy spacy - run: sudo python -m spacy download en - - run: python -m pytest -sv tests/ + - run: python -m pytest -sv tests/ --runslow build_py2: working_directory: ~/pytorch-pretrained-BERT docker: @@ -20,7 +20,7 @@ jobs: - run: sudo pip install pytest spacy - run: sudo pip install ftfy==4.4.3 - run: sudo python -m spacy download en - - run: python -m pytest -sv tests/ + - run: python -m pytest -sv tests/ --runslow workflows: version: 2 build_and_test: diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000000..841ebc8df9 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,19 @@ +# content of conftest.py + +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--runslow", action="store_true", default=False, help="run slow tests" + ) + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runslow"): + # --runslow given in cli: do not skip slow tests + return + skip_slow = pytest.mark.skip(reason="need --runslow option to run") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py index d542422060..8f4581b37f 100644 --- a/tests/modeling_gpt2_test.py +++ b/tests/modeling_gpt2_test.py @@ -20,12 +20,14 @@ import os import unittest import json import random +import shutil +import pytest import torch from pytorch_pretrained_bert import (GPT2Config, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) - +from pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP class GPT2ModelTest(unittest.TestCase): class GPT2ModelTester(object): @@ -185,6 +187,14 @@ class GPT2ModelTest(unittest.TestCase): os.remove(json_file_path) self.assertEqual(config_second.to_dict(), config_first.to_dict()) + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_pretrained_bert_test/" + for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + 
model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + def run_tester(self, tester): config_and_inputs = tester.prepare_config_and_inputs() output_result = tester.create_gpt2_model(*config_and_inputs) diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py index db03bf792e..4e7d9d542b 100644 --- a/tests/modeling_openai_test.py +++ b/tests/modeling_openai_test.py @@ -20,12 +20,14 @@ import os import unittest import json import random +import shutil +import pytest import torch from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) - +from pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP class OpenAIGPTModelTest(unittest.TestCase): class OpenAIGPTModelTester(object): @@ -197,6 +199,14 @@ class OpenAIGPTModelTest(unittest.TestCase): os.remove(json_file_path) self.assertEqual(config_second.to_dict(), config_first.to_dict()) + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_pretrained_bert_test/" + for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + def run_tester(self, tester): config_and_inputs = tester.prepare_config_and_inputs() output_result = tester.create_openai_model(*config_and_inputs) diff --git a/tests/modeling_test.py b/tests/modeling_test.py index 02d7a13fda..5cde383fdf 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -20,6 +20,8 @@ import os import unittest import json import random +import shutil +import pytest import torch @@ -27,6 +29,7 @@ from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM, BertForNextSentencePrediction, BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification) +from 
pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP class BertModelTest(unittest.TestCase): @@ -260,6 +263,14 @@ class BertModelTest(unittest.TestCase): os.remove(json_file_path) self.assertEqual(config_second.to_dict(), config_first.to_dict()) + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_pretrained_bert_test/" + for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = BertModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + def run_tester(self, tester): config_and_inputs = tester.prepare_config_and_inputs() output_result = tester.create_bert_model(*config_and_inputs) diff --git a/tests/modeling_transfo_xl_test.py b/tests/modeling_transfo_xl_test.py index a59d90b205..e5c5f3d163 100644 --- a/tests/modeling_transfo_xl_test.py +++ b/tests/modeling_transfo_xl_test.py @@ -20,11 +20,13 @@ import os import unittest import json import random +import shutil +import pytest import torch from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) - +from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP class TransfoXLModelTest(unittest.TestCase): class TransfoXLModelTester(object): @@ -195,6 +197,14 @@ class TransfoXLModelTest(unittest.TestCase): os.remove(json_file_path) self.assertEqual(config_second.to_dict(), config_first.to_dict()) + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_pretrained_bert_test/" + for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + def run_tester(self, tester): config_and_inputs = tester.prepare_config_and_inputs() diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py index 0773574360..870f61ca79 100644 --- a/tests/tokenization_gpt2_test.py +++ 
b/tests/tokenization_gpt2_test.py @@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest import json +import shutil +import pytest -from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer +from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP class GPT2TokenizationTest(unittest.TestCase): @@ -64,6 +66,13 @@ class GPT2TokenizationTest(unittest.TestCase): [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks, tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder]) + @pytest.mark.slow + def test_tokenizer_from_pretrained(self): + cache_dir = "/tmp/pytorch_pretrained_bert_test/" + for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]: + tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(tokenizer) if __name__ == '__main__': unittest.main() diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py index 2011ccc1df..a57f86be57 100644 --- a/tests/tokenization_openai_test.py +++ b/tests/tokenization_openai_test.py @@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest import json +import shutil +import pytest -from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer +from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP class OpenAIGPTTokenizationTest(unittest.TestCase): @@ -64,6 +66,14 @@ class OpenAIGPTTokenizationTest(unittest.TestCase): [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks, tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder]) + @pytest.mark.slow + def test_tokenizer_from_pretrained(self): + cache_dir = "/tmp/pytorch_pretrained_bert_test/" + for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]: + tokenizer = 
OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(tokenizer) + if __name__ == '__main__': unittest.main() diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py index 15cc7ccd82..fe120a522c 100644 --- a/tests/tokenization_test.py +++ b/tests/tokenization_test.py @@ -17,12 +17,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest from io import open +import shutil +import pytest from pytorch_pretrained_bert.tokenization import (BasicTokenizer, BertTokenizer, WordpieceTokenizer, _is_control, _is_punctuation, - _is_whitespace) + _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP) class TokenizationTest(unittest.TestCase): @@ -56,6 +58,13 @@ class TokenizationTest(unittest.TestCase): self.assertListEqual( tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + @pytest.mark.slow + def test_tokenizer_from_pretrained(self): + cache_dir = "/tmp/pytorch_pretrained_bert_test/" + for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]: + tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(tokenizer) def test_chinese(self): tokenizer = BasicTokenizer() diff --git a/tests/tokenization_transfo_xl_test.py b/tests/tokenization_transfo_xl_test.py index 1a805f11e6..bf0ac5db2f 100644 --- a/tests/tokenization_transfo_xl_test.py +++ b/tests/tokenization_transfo_xl_test.py @@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest from io import open +import shutil +import pytest -from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer +from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP class TransfoXLTokenizationTest(unittest.TestCase): @@ -66,6 +68,13 @@ class TransfoXLTokenizationTest(unittest.TestCase): 
tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]) + @pytest.mark.slow + def test_tokenizer_from_pretrained(self): + cache_dir = "/tmp/pytorch_pretrained_bert_test/" + for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]: + tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(tokenizer) if __name__ == '__main__': unittest.main() From 929579f3b5198185a5dd7f09eeee646b5f294398 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 12:35:08 +0200 Subject: [PATCH 42/47] fix #497 --- pytorch_pretrained_bert/tokenization_gpt2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 80be4435df..07777292a3 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -97,6 +97,7 @@ class GPT2Tokenizer(object): if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None else: vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) From bcde2c61cb2a9b4b5c2b2234e2e8cee505e695e8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 12:35:38 +0200 Subject: [PATCH 43/47] fix #497 --- pytorch_pretrained_bert/tokenization_gpt2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 80be4435df..07777292a3 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -97,6 +97,7 @@ class GPT2Tokenizer(object): if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: vocab_file = 
PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None else: vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) From fa765202402499486efd1cb3484c5e70555479c2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 13:32:22 +0200 Subject: [PATCH 44/47] fix file_utils on python 2 --- pytorch_pretrained_bert/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 6a24b099e1..6de7e259e5 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -227,7 +227,7 @@ def get_from_cache(url, cache_dir=None): meta = {'url': url, 'etag': etag} meta_path = cache_path + '.json' with open(meta_path, 'w', encoding="utf-8") as meta_file: - meta_file.write(json.dumps(meta)) + json.dump(meta, meta_file) logger.info("removing temp file %s", temp_file.name) From 265550ec34bfa756538c60e0d5d4c906ee78e1ce Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 14:22:35 +0200 Subject: [PATCH 45/47] relax network connection requirements --- pytorch_pretrained_bert/file_utils.py | 29 ++++++++++++++++++++------- tests/tokenization_gpt2_test.py | 2 +- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 6de7e259e5..e7e1714f97 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -5,11 +5,13 @@ Copyright by the AllenNLP authors. 
""" from __future__ import (absolute_import, division, print_function, unicode_literals) +import sys import json import logging import os import shutil import tempfile +import fnmatch from functools import wraps from hashlib import sha256 import sys @@ -191,17 +193,30 @@ def get_from_cache(url, cache_dir=None): if url.startswith("s3://"): etag = s3_etag(url) else: - response = requests.head(url, allow_redirects=True) - if response.status_code != 200: - raise IOError("HEAD request failed for url {} with status code {}" - .format(url, response.status_code)) - etag = response.headers.get("ETag") + try: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + etag = None + else: + etag = response.headers.get("ETag") + except EnvironmentError: + etag = None + if sys.version_info[0] == 2 and etag is not None: + etag = etag.decode('utf-8') filename = url_to_filename(url, etag) # get cache path to put the file cache_path = os.path.join(cache_dir, filename) + # If we don't have a connection (etag is None) and can't identify the file + # try to get the last downloaded one + if not os.path.exists(cache_path) and etag is None: + matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') + matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) + if matching_files: + cache_path = os.path.join(cache_dir, matching_files[-1]) + if not os.path.exists(cache_path): # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. 
@@ -226,8 +241,8 @@ def get_from_cache(url, cache_dir=None): logger.info("creating metadata file for %s", cache_path) meta = {'url': url, 'etag': etag} meta_path = cache_path + '.json' - with open(meta_path, 'w', encoding="utf-8") as meta_file: - json.dump(meta, meta_file) + with open(meta_path, 'w') as meta_file: + meta_file.write(json.dumps(meta, indent=4)) logger.info("removing temp file %s", temp_file.name) diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py index 870f61ca79..cfd13de391 100644 --- a/tests/tokenization_gpt2_test.py +++ b/tests/tokenization_gpt2_test.py @@ -66,7 +66,7 @@ class GPT2TokenizationTest(unittest.TestCase): [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks, tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder]) - @pytest.mark.slow + # @pytest.mark.slow def test_tokenizer_from_pretrained(self): cache_dir = "/tmp/pytorch_pretrained_bert_test/" for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]: From 23d4554ec05d6cf5b35960052de8f324b7e0ec86 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 14:48:34 +0200 Subject: [PATCH 46/47] is python 2 happy now --- pytorch_pretrained_bert/file_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index e7e1714f97..17bdd258ea 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -242,7 +242,10 @@ def get_from_cache(url, cache_dir=None): meta = {'url': url, 'etag': etag} meta_path = cache_path + '.json' with open(meta_path, 'w') as meta_file: - meta_file.write(json.dumps(meta, indent=4)) + output_string = json.dumps(meta) + if sys.version_info[0] == 2 and isinstance(output_string, str): + output_string = unicode(output_string, 'utf-8') # The beauty of python 2 + meta_file.write(output_string) logger.info("removing temp file %s", temp_file.name) From 
34ae5bf8385cce8f792c803fa288eccf472003ff Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 17 Apr 2019 14:52:12 +0200 Subject: [PATCH 47/47] small clean up in tests --- tests/tokenization_gpt2_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py index cfd13de391..4ae804a060 100644 --- a/tests/tokenization_gpt2_test.py +++ b/tests/tokenization_gpt2_test.py @@ -40,7 +40,6 @@ class GPT2TokenizationTest(unittest.TestCase): merges_file = fp.name tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["", ""]) - print("encoder", tokenizer.byte_encoder) os.remove(vocab_file) os.remove(merges_file)