From f872eb98c2b046c9f806b8cf0196d48e77c17899 Mon Sep 17 00:00:00 2001
From: dhanajitb <dhana.phoenix@gmail.com>
Date: Thu, 28 Mar 2019 22:46:15 +0530
Subject: [PATCH 01/47] making unconditional generation work

Unconditional generation now works, but with a fixed seed the output is the same on every run.
Requesting more than one sample (nsamples > 1) still yields different samples, though.
The start token used for unconditional generation is '<|endoftext|>'.
---
 examples/run_gpt2.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 0350747499..0289b26702 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -106,6 +106,23 @@ def run_model():
                 print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                 print(text)
         print("=" * 80)
+    if args.unconditional:
+        generated = 0
+        for _ in range(args.nsamples // args.batch_size):
+            out = sample_sequence(
+                model=model, length=args.length,
+                context=None,
+                start_token=enc.encoder['<|endoftext|>'],
+                batch_size=args.batch_size,
+                temperature=args.temperature, top_k=args.top_k, device=device
+            )
+            out = out[:,1:].tolist()
+            for i in range(args.batch_size):
+                generated += 1
+                text = enc.decode(out[i])
+                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
+                print(text)
+        print("=" * 80)
 
 if __name__ == '__main__':
     run_model()

From 0d6a882f63ddc1726e43efee0151b5bce3d67eb2 Mon Sep 17 00:00:00 2001
From: dhanajitb <dhana.phoenix@gmail.com>
Date: Sun, 7 Apr 2019 16:54:38 +0530
Subject: [PATCH 02/47] Cleaned some redundant lines

The redundant nested checks

```
while not args.unconditional:
    if not args.unconditional:
```

have been replaced with a single `if not args.unconditional:` guard around a `while True:` loop.
---
 examples/run_gpt2.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 0289b26702..f9a1962d26 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -83,29 +83,29 @@ def run_model():
     elif args.length > model.config.n_ctx:
         raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
 
-    while not args.unconditional:
-        if not args.unconditional:
+    if not args.unconditional:
+        while True:
             raw_text = input("Model prompt >>> ")
             while not raw_text:
                 print('Prompt should not be empty!')
                 raw_text = input("Model prompt >>> ")
             context_tokens = enc.encode(raw_text)
-        generated = 0
-        for _ in range(args.nsamples // args.batch_size):
-            out = sample_sequence(
-                model=model, length=args.length,
-                context=context_tokens if not args.unconditional else None,
-                start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
-                batch_size=args.batch_size,
-                temperature=args.temperature, top_k=args.top_k, device=device
-            )
-            out = out[:, len(context_tokens):].tolist()
-            for i in range(args.batch_size):
-                generated += 1
-                text = enc.decode(out[i])
-                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                print(text)
-        print("=" * 80)
+            generated = 0
+            for _ in range(args.nsamples // args.batch_size):
+                out = sample_sequence(
+                    model=model, length=args.length,
+                    context=context_tokens,
+                    start_token=None,
+                    batch_size=args.batch_size,
+                    temperature=args.temperature, top_k=args.top_k, device=device
+                )
+                out = out[:, len(context_tokens):].tolist()
+                for i in range(args.batch_size):
+                    generated += 1
+                    text = enc.decode(out[i])
+                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
+                    print(text)
+            print("=" * 80)
     if args.unconditional:
         generated = 0
         for _ in range(args.nsamples // args.batch_size):

From 4d3cf0d6028d7576b8c51ba1eda8403e86b42b05 Mon Sep 17 00:00:00 2001
From: Dhanajit Brahma <dhana.phoenix@gmail.com>
Date: Sun, 7 Apr 2019 16:59:07 +0530
Subject: [PATCH 03/47] removing some redundant lines

---
 examples/run_gpt2.py | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 0289b26702..b22df39b98 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -83,29 +83,29 @@ def run_model():
     elif args.length > model.config.n_ctx:
         raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
 
-    while not args.unconditional:
-        if not args.unconditional:
+    if not args.unconditional:
+        while True:
             raw_text = input("Model prompt >>> ")
             while not raw_text:
                 print('Prompt should not be empty!')
                 raw_text = input("Model prompt >>> ")
             context_tokens = enc.encode(raw_text)
-        generated = 0
-        for _ in range(args.nsamples // args.batch_size):
-            out = sample_sequence(
-                model=model, length=args.length,
-                context=context_tokens if not args.unconditional else None,
-                start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
-                batch_size=args.batch_size,
-                temperature=args.temperature, top_k=args.top_k, device=device
-            )
-            out = out[:, len(context_tokens):].tolist()
-            for i in range(args.batch_size):
-                generated += 1
-                text = enc.decode(out[i])
-                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                print(text)
-        print("=" * 80)
+            generated = 0
+            for _ in range(args.nsamples // args.batch_size):
+                out = sample_sequence(
+                    model=model, length=args.length,
+                    context=context_tokens,
+                    start_token=None,
+                    batch_size=args.batch_size,
+                    temperature=args.temperature, top_k=args.top_k, device=device
+                )
+                out = out[:, len(context_tokens):].tolist()
+                for i in range(args.batch_size):
+                    generated += 1
+                    text = enc.decode(out[i])
+                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
+                    print(text)
+            print("=" * 80)
     if args.unconditional:
         generated = 0
         for _ in range(args.nsamples // args.batch_size):
@@ -127,3 +127,4 @@ def run_model():
 if __name__ == '__main__':
     run_model()
 
+

From fd8a3556f08bbcfb9c4f3eadea6206751c1b1dd9 Mon Sep 17 00:00:00 2001
From: Benjamin Mann <8enmann@gmail.com>
Date: Mon, 8 Apr 2019 17:20:35 -0700
Subject: [PATCH 04/47] fix run_gpt2.py

---
 examples/run_gpt2.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 0350747499..a30c6c6456 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -83,7 +83,8 @@ def run_model():
     elif args.length > model.config.n_ctx:
         raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
 
-    while not args.unconditional:
+    while True:
+        context_tokens = []
         if not args.unconditional:
             raw_text = input("Model prompt >>> ")
             while not raw_text:
@@ -106,6 +107,8 @@ def run_model():
                 print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                 print(text)
         print("=" * 80)
+        if args.unconditional:
+            break
 
 if __name__ == '__main__':
     run_model()

From 8fffba5f475d175dfc9246a5ebc0b99fde115221 Mon Sep 17 00:00:00 2001
From: Yaroslav Bulatov <yaroslavvb@gmail.com>
Date: Tue, 9 Apr 2019 14:45:47 -0700
Subject: [PATCH 05/47] Update README.md

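Add the missing `--do_train` flag to the simple_lm_finetuning.py example command. Without it the script fails with:

```
04/09/2019 21:39:38 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False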
Traceback (most recent call last):
  File "/home/ubuntu/pytorch-pretrained-BERT/examples/lm_finetuning/simple_lm_finetuning.py", line 642, in <module>
    main()
  File "/home/ubuntu/pytorch-pretrained-BERT/examples/lm_finetuning/simple_lm_finetuning.py", line 502, in main
    raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")
ValueError: Training is currently the only implemented execution option. Please set `do_train`.
```
---
 examples/lm_finetuning/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/README.md b/examples/lm_finetuning/README.md
index c48d9b7069..f04e877ef2 100644
--- a/examples/lm_finetuning/README.md
+++ b/examples/lm_finetuning/README.md
@@ -37,6 +37,7 @@ python3 simple_lm_finetuning.py
 --bert_model bert-base-uncased 
 --do_lower_case 
 --output_dir finetuned_lm/
+--do_train
 ```
 
 ### Pregenerating training data
@@ -60,4 +61,4 @@ python3 finetune_on_pregenerated.py
 --do_lower_case
 --output_dir finetuned_lm/
 --epochs 3
-```
\ No newline at end of file
+```

From e99b2014ccaa4a19846ccb5191e63b4bfdb1baa6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Apr 2019 11:43:13 +0200
Subject: [PATCH 06/47] fixes #471

---
 pytorch_pretrained_bert/modeling_openai.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 296abbfc31..7bf643675e 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -371,8 +371,8 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
     def forward(self, hidden_states, mc_token_ids):
         # Classification logits
         # hidden_state (bsz, num_choices, seq_length, hidden_size)
-        # mc_token_ids (bsz, num_choices)
-        mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
+        # mc_token_ids (bsz, num_choices, 1)
+        mc_token_ids = mc_token_ids.unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
         # (bsz, num_choices, 1, hidden_size)
         multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
         # (bsz, num_choices, hidden_size)
@@ -605,14 +605,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             return
         # Update config
         self.config.n_special = num_special_tokens
-        # # Build new embeddings and initialize
+        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
         old_embed = self.tokens_embed
         self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        # Initialize all new embeddings (in particular the special tokens)
         self.init_weights(self.tokens_embed)
-        # Copy word and positional embeddings from the previous weights
-        self.tokens_embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
-        self.tokens_embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
+        # Copy word embeddings from the previous weights
+        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None):
         if position_ids is None:

From 4a82f4f85685c22b995108909485d822f3e3c607 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Apr 2019 13:11:22 +0200
Subject: [PATCH 07/47] update special token addition

---
 pytorch_pretrained_bert/modeling_openai.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index fb3d0cadb7..feae95d962 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -608,6 +608,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # Build new embeddings and initialize all new embeddings (in particular the special tokens)
         old_embed = self.tokens_embed
         self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
+        self.tokens_embed.to(old_embed.device.weight.device)
         self.init_weights(self.tokens_embed)
         # Copy word embeddings from the previous weights
         self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]

From a05fad8dcee87087368ad996fe2d76599b406e34 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Apr 2019 13:16:17 +0200
Subject: [PATCH 08/47] fix typo

---
 pytorch_pretrained_bert/modeling_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index feae95d962..1a2a3feb20 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -608,7 +608,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # Build new embeddings and initialize all new embeddings (in particular the special tokens)
         old_embed = self.tokens_embed
         self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.tokens_embed.to(old_embed.device.weight.device)
+        self.tokens_embed.to(old_embed.weight.device)
         self.init_weights(self.tokens_embed)
         # Copy word embeddings from the previous weights
         self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]

From 4bc4c69af92ceef82b1c9df126cc14f0eb7033e8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Apr 2019 16:57:59 +0200
Subject: [PATCH 09/47] finetuning any BERT model - fixes #455

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 035f97b0c9..6a63324502 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -123,9 +123,8 @@ def main():
     parser = ArgumentParser()
     parser.add_argument('--pregenerated_data', type=Path, required=True)
     parser.add_argument('--output_dir', type=Path, required=True)
-    parser.add_argument("--bert_model", type=str, required=True,
-                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
-                                 "bert-base-multilingual", "bert-base-chinese"])
+    parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
     parser.add_argument("--do_lower_case", action="store_true")
     parser.add_argument("--reduce_memory", action="store_true",
                         help="Store training data as on-disc memmaps to massively reduce memory usage")

From 724eb45cef001bf8b73ada5c80494c58c361ef24 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Apr 2019 17:12:00 +0200
Subject: [PATCH 10/47] add stale bot

---
 .github/stale.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 .github/stale.yml

diff --git a/.github/stale.yml b/.github/stale.yml
new file mode 100644
index 0000000000..d9f6563218
--- /dev/null
+++ b/.github/stale.yml
@@ -0,0 +1,17 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 60
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 7
+# Issues with these labels will never be considered stale
+exemptLabels:
+  - pinned
+  - security
+# Label to use when marking an issue as stale
+staleLabel: wontfix
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you
+  for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
\ No newline at end of file

From 074c869bbebd9ad1b8ec1c52ecc506ba982e8483 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Apr 2019 20:53:50 +0200
Subject: [PATCH 11/47] fix OpenAIGPTMultipleChoiceHead

---
 pytorch_pretrained_bert/modeling_openai.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index fb3d0cadb7..b6252d097f 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -371,8 +371,8 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
     def forward(self, hidden_states, mc_token_ids):
         # Classification logits
         # hidden_state (bsz, num_choices, seq_length, hidden_size)
-        # mc_token_ids (bsz, num_choices, 1)
-        mc_token_ids = mc_token_ids.unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
+        # mc_token_ids (bsz, num_choices)
+        mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
         # (bsz, num_choices, 1, hidden_size)
         multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
         # (bsz, num_choices, hidden_size)

From c49ce3c722c35324803e40efb88b1a3057c7f249 Mon Sep 17 00:00:00 2001
From: Jie Yang <jieynlp@gmail.com>
Date: Thu, 11 Apr 2019 15:40:19 -0400
Subject: [PATCH 12/47] fix tsv read error in Windows

---
 examples/run_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index 751d581ad9..4268c41ec6 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -95,7 +95,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
+        with open(input_file, "r", encoding="utf-8") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:

From 1d203a34c06fb8b2c1de856d58950f9d193cc1fc Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Apr 2019 23:51:03 +0200
Subject: [PATCH 13/47] back to simple indexing

---
 pytorch_pretrained_bert/modeling_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 1a2a3feb20..be4f959485 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -372,7 +372,7 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
         # Classification logits
         # hidden_state (bsz, num_choices, seq_length, hidden_size)
         # mc_token_ids (bsz, num_choices, 1)
-        mc_token_ids = mc_token_ids.unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
+        mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
         # (bsz, num_choices, 1, hidden_size)
         multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
         # (bsz, num_choices, hidden_size)

From b509bf765574852648020d60690386b80e970cf6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Apr 2019 12:12:33 +0200
Subject: [PATCH 14/47] updating loss computation

---
 pytorch_pretrained_bert/modeling_openai.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index be4f959485..c4d20c331e 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -716,9 +716,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
             # Shift so that tokens < n predict n
-            shift_logits = lm_logits[:, :-1].contiguous()
-            shift_labels = lm_labels[:, 1:].contiguous()
-
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
@@ -808,11 +807,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
-            shift_logits = lm_logits[:, :-1].contiguous()
-            shift_labels = lm_labels[:, 1:].contiguous()
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(shift_logits.view(-1,
-                          shift_logits.size(-1)), shift_labels.view(-1)))
+            losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))

From dbbd6c7500dded778706326c7a1e402cffe97eb8 Mon Sep 17 00:00:00 2001
From: Matthew Carrigan <rocketknight1@gmail.com>
Date: Fri, 12 Apr 2019 15:07:58 +0100
Subject: [PATCH 15/47] Replaced some randints with cleaner randranges, and
 added a helpful error for users whose corpus is just one giant document.

---
 .../pregenerate_training_data.py              | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index 8cc28d2e78..e6c3598a9f 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -4,7 +4,7 @@ from tqdm import tqdm, trange
 from tempfile import TemporaryDirectory
 import shelve
 
-from random import random, randint, shuffle, choice, sample
+from random import random, randrange, randint, shuffle, choice, sample
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 import numpy as np
 import json
@@ -30,6 +30,8 @@ class DocumentDatabase:
         self.reduce_memory = reduce_memory
 
     def add_document(self, document):
+        if not document:
+            return
         if self.reduce_memory:
             current_idx = len(self.doc_lengths)
             self.document_shelf[str(current_idx)] = document
@@ -49,11 +51,11 @@ class DocumentDatabase:
                 self._precalculate_doc_weights()
             rand_start = self.doc_cumsum[current_idx]
             rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
-            sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
+            sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
             sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
         else:
             # If we don't use sentence weighting, then every doc has an equal chance to be chosen
-            sampled_doc_index = current_idx + randint(1, len(self.doc_lengths)-1)
+            sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
         assert sampled_doc_index != current_idx
         if self.reduce_memory:
             return self.document_shelf[str(sampled_doc_index)]
@@ -170,7 +172,7 @@ def create_instances_from_document(
                 # (first) sentence.
                 a_end = 1
                 if len(current_chunk) >= 2:
-                    a_end = randint(1, len(current_chunk) - 1)
+                    a_end = randrange(1, len(current_chunk))
 
                 tokens_a = []
                 for j in range(a_end):
@@ -186,7 +188,7 @@ def create_instances_from_document(
                     # Sample a random document, with longer docs being sampled more frequently
                     random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)
 
-                    random_start = randint(0, len(random_document) - 1)
+                    random_start = randrange(0, len(random_document))
                     for j in range(random_start, len(random_document)):
                         tokens_b.extend(random_document[j])
                         if len(tokens_b) >= target_b_length:
@@ -264,6 +266,14 @@ def main():
                 else:
                     tokens = tokenizer.tokenize(line)
                     doc.append(tokens)
+            if doc:
+                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
+        if len(docs) <= 1:
+            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
+                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
+                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
+                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
+                 "sections or paragraphs.")
 
         args.output_dir.mkdir(exist_ok=True)
         for epoch in trange(args.epochs_to_generate, desc="Epoch"):

From 34cf67fd6c3690bdc02d15cbc44da272b938c330 Mon Sep 17 00:00:00 2001
From: Martin Boyanov <mboyanov@gmail.com>
Date: Fri, 12 Apr 2019 21:30:28 +0300
Subject: [PATCH 16/47] Extend the BertForSequenceClassification docs to
 mention the special CLS token.

---
 pytorch_pretrained_bert/modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 2736e34d7f..037c6e9723 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -930,7 +930,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to

From fe2756ff41147ea6de14d8f81ecc5304382af91d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 10:04:05 +0200
Subject: [PATCH 17/47] update double head model

---
 pytorch_pretrained_bert/modeling_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index c4d20c331e..7b95d74f7c 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -371,7 +371,7 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
     def forward(self, hidden_states, mc_token_ids):
         # Classification logits
         # hidden_state (bsz, num_choices, seq_length, hidden_size)
-        # mc_token_ids (bsz, num_choices, 1)
+        # mc_token_ids (bsz, num_choices)
         mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
         # (bsz, num_choices, 1, hidden_size)
         multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)

From 3e65f255dcaf8cac7dabf11adc318756dc5664bb Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 11:47:25 +0200
Subject: [PATCH 18/47] add serialization semantics to tokenizers - fix
 transfo-xl tokenizer

---
 examples/run_transfo_xl.py                    |   3 +-
 pytorch_pretrained_bert/tokenization.py       |  13 ++
 pytorch_pretrained_bert/tokenization_gpt2.py  |  16 +++
 .../tokenization_openai.py                    |  16 +++
 .../tokenization_transfo_xl.py                | 129 +++---------------
 5 files changed, 67 insertions(+), 110 deletions(-)

diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index 8139f28baf..0ea7b32053 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math
 
 import torch
 
-from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus
+from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -80,6 +80,7 @@ def main():
     # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
     # and tokenizing the dataset
     # The pre-processed corpus is a convertion (using the conversion script )
+    tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name)
     corpus = TransfoXLCorpus.from_pretrained(args.model_name)
     ntokens = len(corpus.vocab)
 
diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index bbb3e25fc7..6e2e11ed92 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -134,6 +134,19 @@ class BertTokenizer(object):
             tokens.append(self.ids_to_tokens[i])
         return tokens
 
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a path."""
+        index = 0
+        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
+                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
+                    index = token_index
+                writer.write(token + u'\n')
+                index += 1
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index db95719dbc..07db995b96 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -187,6 +187,22 @@ class GPT2Tokenizer(object):
         self.cache[token] = word
         return word
 
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a path."""
+        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        merge_file = os.path.join(vocab_path, MERGES_NAME)
+        json.dump(self.encoder, vocab_file)
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write(u'#version: 0.2\n')
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    index = token_index
+                writer.write(bpe_tokens + u'\n')
+                index += 1
+
     def encode(self, text):
         bpe_tokens = []
         for token in re.findall(self.pat, text):
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 240122d12d..aa0438ccf8 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -261,3 +261,19 @@ class OpenAIGPTTokenizer(object):
                     ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
                     ).replace(" 've", "'ve")
         return out_string
+
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a path."""
+        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        merge_file = os.path.join(vocab_path, MERGES_NAME)
+        json.dump(self.encoder, vocab_file)
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write(u'#version: 0.2\n')
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    index = token_index
+                writer.write(bpe_tokens + u'\n')
+                index += 1
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index b5360c5184..b6470c7667 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -63,7 +63,10 @@ class TransfoXLTokenizer(object):
         if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
             vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
+            if os.path.isdir(pretrained_model_name_or_path):
+                vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
+            else:
+                vocab_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
@@ -141,6 +144,11 @@ class TransfoXLTokenizer(object):
         else:
             raise ValueError('No <unkown> token in vocabulary')
 
+    def save_vocabulary(self, vocab_path):
+        index = 0
+        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        torch.save(self.__dict__, vocab_file)
+
     def build_vocab(self):
         if self.vocab_file:
             print('building vocab from {}'.format(self.vocab_file))
@@ -245,82 +253,24 @@ class TransfoXLTokenizer(object):
     def __len__(self):
         return len(self.idx2sym)
 
-    def _run_split_on_punc(self, text):
-        """Splits punctuation on a piece of text."""
-        if text in self.never_split:
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xfffd or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def whitespace_tokenize(self, text):
-        """Runs basic whitespace cleaning and splitting on a piece of text."""
-        text = text.strip()
-        if not text:
-            return []
-        if self.delimiter == '':
-            tokens = text
-        else:
-            tokens = text.split(self.delimiter)
-        return tokens
-
     def tokenize(self, line, add_eos=False, add_double_eos=False):
-        line = self._clean_text(line)
         line = line.strip()
+        # convert to lower case
+        if self.lower_case:
+            line = line.lower()
 
-        symbols = self.whitespace_tokenize(line)
-
-        split_symbols = []
-        for symbol in symbols:
-            if self.lower_case and symbol not in self.never_split:
-                symbol = symbol.lower()
-                symbol = self._run_strip_accents(symbol)
-            split_symbols.extend(self._run_split_on_punc(symbol))
+        # empty delimiter '' will evaluate False
+        if self.delimiter == '':
+            symbols = line
+        else:
+            symbols = line.split(self.delimiter)
 
         if add_double_eos: # lm1b
-            return ['<S>'] + split_symbols + ['<S>']
+            return ['<S>'] + symbols + ['<S>']
         elif add_eos:
-            return split_symbols + ['<eos>']
+            return symbols + ['<eos>']
         else:
-            return split_symbols
+            return symbols
 
 
 class LMOrderedIterator(object):
@@ -631,42 +581,3 @@ def get_lm_corpus(datadir, dataset):
         torch.save(corpus, fn)
 
     return corpus
-
-def _is_whitespace(char):
-    """Checks whether `chars` is a whitespace character."""
-    # \t, \n, and \r are technically contorl characters but we treat them
-    # as whitespace since they are generally considered as such.
-    if char == " " or char == "\t" or char == "\n" or char == "\r":
-        return True
-    cat = unicodedata.category(char)
-    if cat == "Zs":
-        return True
-    return False
-
-
-def _is_control(char):
-    """Checks whether `chars` is a control character."""
-    # These are technically control characters but we count them as whitespace
-    # characters.
-    if char == "\t" or char == "\n" or char == "\r":
-        return False
-    cat = unicodedata.category(char)
-    if cat.startswith("C"):
-        return True
-    return False
-
-
-def _is_punctuation(char):
-    """Checks whether `chars` is a punctuation character."""
-    cp = ord(char)
-    # We treat all non-letter/number ASCII as punctuation.
-    # Characters such as "^", "$", and "`" are not in the Unicode
-    # Punctuation class but we treat them as punctuation anyways, for
-    # consistency.
-    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
-        return True
-    cat = unicodedata.category(char)
-    if cat.startswith("P"):
-        return True
-    return False

From 870b734bfd2cc83e43b29050fba03709a0c5b539 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 12:03:56 +0200
Subject: [PATCH 19/47] added tokenizers serialization tests

---
 pytorch_pretrained_bert/tokenization.py       |  1 +
 pytorch_pretrained_bert/tokenization_gpt2.py  |  6 ++-
 .../tokenization_openai.py                    |  6 ++-
 .../tokenization_transfo_xl.py                |  1 +
 tests/tokenization_openai_test.py             | 16 +++++++
 tests/tokenization_test.py                    | 11 +++++
 tests/tokenization_transfo_xl_test.py         | 42 ++++++-------------
 7 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index 6e2e11ed92..8fd65f55f0 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -146,6 +146,7 @@ class BertTokenizer(object):
                     index = token_index
                 writer.write(token + u'\n')
                 index += 1
+        return vocab_file
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index 07db995b96..b49e1310e4 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -188,7 +188,10 @@ class GPT2Tokenizer(object):
         return word
 
     def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a path."""
+        """Save the tokenizer vocabulary and merge files to a directory."""
+        if not os.path.isdir(vocab_path):
+            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+            return
         vocab_file = os.path.join(vocab_path, VOCAB_NAME)
         merge_file = os.path.join(vocab_path, MERGES_NAME)
         json.dump(self.encoder, vocab_file)
@@ -202,6 +205,7 @@ class GPT2Tokenizer(object):
                     index = token_index
                 writer.write(bpe_tokens + u'\n')
                 index += 1
+        return vocab_file, merge_file
 
     def encode(self, text):
         bpe_tokens = []
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index aa0438ccf8..f3ce7ab251 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -263,7 +263,10 @@ class OpenAIGPTTokenizer(object):
         return out_string
 
     def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a path."""
+        """Save the tokenizer vocabulary and merge files to a directory."""
+        if not os.path.isdir(vocab_path):
+            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+            return
         vocab_file = os.path.join(vocab_path, VOCAB_NAME)
         merge_file = os.path.join(vocab_path, MERGES_NAME)
         json.dump(self.encoder, vocab_file)
@@ -277,3 +280,4 @@ class OpenAIGPTTokenizer(object):
                     index = token_index
                 writer.write(bpe_tokens + u'\n')
                 index += 1
+        return vocab_file, merge_file
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index b6470c7667..f704a035db 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -148,6 +148,7 @@ class TransfoXLTokenizer(object):
         index = 0
         vocab_file = os.path.join(vocab_path, VOCAB_NAME)
         torch.save(self.__dict__, vocab_file)
+        return vocab_file
 
     def build_vocab(self):
         if self.vocab_file:
diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py
index 6213eb1b03..2b1bdd3a9a 100644
--- a/tests/tokenization_openai_test.py
+++ b/tests/tokenization_openai_test.py
@@ -52,5 +52,21 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
+        vocab_file, merges_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
+        tokenizer.from_pretrained("/tmp/")
+        os.remove(vocab_file)
+        os.remove(merges_file)
+
+        text = "lower"
+        bpe_tokens = ["low", "er</w>"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + ["<unk>"]
+        input_bpe_tokens = [14, 15, 20]
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py
index 78e145ffd2..15cc7ccd82 100644
--- a/tests/tokenization_test.py
+++ b/tests/tokenization_test.py
@@ -46,6 +46,17 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+        vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
+        tokenizer.from_pretrained(vocab_file)
+        os.remove(vocab_file)
+
+        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
+
+
     def test_chinese(self):
         tokenizer = BasicTokenizer()
 
diff --git a/tests/tokenization_transfo_xl_test.py b/tests/tokenization_transfo_xl_test.py
index 9ff04f5f34..add2eb4e71 100644
--- a/tests/tokenization_transfo_xl_test.py
+++ b/tests/tokenization_transfo_xl_test.py
@@ -18,9 +18,7 @@ import os
 import unittest
 from io import open
 
-from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer,
-                                                  _is_control, _is_punctuation,
-                                                  _is_whitespace)
+from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
 
 
 class TransfoXLTokenizationTest(unittest.TestCase):
@@ -43,6 +41,17 @@ class TransfoXLTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
 
+        vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
+        tokenizer.from_pretrained(vocab_file)
+        os.remove(vocab_file)
+
+        tokens = tokenizer.tokenize(u"<unk> UNwant\u00E9d,running")
+        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
+
+
     def test_full_tokenizer_lower(self):
         tokenizer = TransfoXLTokenizer(lower_case=True)
 
@@ -58,33 +67,6 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
             ["HeLLo", "!", "how", "Are", "yoU", "?"])
 
-    def test_is_whitespace(self):
-        self.assertTrue(_is_whitespace(u" "))
-        self.assertTrue(_is_whitespace(u"\t"))
-        self.assertTrue(_is_whitespace(u"\r"))
-        self.assertTrue(_is_whitespace(u"\n"))
-        self.assertTrue(_is_whitespace(u"\u00A0"))
-
-        self.assertFalse(_is_whitespace(u"A"))
-        self.assertFalse(_is_whitespace(u"-"))
-
-    def test_is_control(self):
-        self.assertTrue(_is_control(u"\u0005"))
-
-        self.assertFalse(_is_control(u"A"))
-        self.assertFalse(_is_control(u" "))
-        self.assertFalse(_is_control(u"\t"))
-        self.assertFalse(_is_control(u"\r"))
-
-    def test_is_punctuation(self):
-        self.assertTrue(_is_punctuation(u"-"))
-        self.assertTrue(_is_punctuation(u"$"))
-        self.assertTrue(_is_punctuation(u"`"))
-        self.assertTrue(_is_punctuation(u"."))
-
-        self.assertFalse(_is_punctuation(u"A"))
-        self.assertFalse(_is_punctuation(u" "))
-
 
 if __name__ == '__main__':
     unittest.main()

From e8568a3b17454dd4e0b32b6cd80617aa662cc996 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 12:55:38 +0200
Subject: [PATCH 20/47] fixing tests

---
 pytorch_pretrained_bert/tokenization_gpt2.py  | 27 ++++++++++++++++---
 .../tokenization_openai.py                    | 27 ++++++++++++++++---
 tests/tokenization_openai_test.py             |  2 +-
 tests/tokenization_transfo_xl_test.py         |  9 +++----
 4 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index b49e1310e4..ab80876ee5 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -45,6 +45,7 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
 }
 VOCAB_NAME = 'vocab.json'
 MERGES_NAME = 'merges.txt'
+SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 @lru_cache()
 def bytes_to_unicode():
@@ -97,6 +98,11 @@ class GPT2Tokenizer(object):
         else:
             vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
             merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
+            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
+            if not os.path.exists(special_tokens_file):
+                special_tokens_file = None
+            else:
+                logger.info("loading special tokens file {}".format(special_tokens_file))
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
@@ -125,7 +131,11 @@ class GPT2Tokenizer(object):
             max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
+        if special_tokens_file and 'special_tokens' not in kwargs:
+            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+        else:
+            special_tokens = kwargs.pop('special_tokens', [])
+        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
         return tokenizer
 
     def __init__(self, vocab_file, merges_file, errors='replace', max_len=None):
@@ -194,7 +204,11 @@ class GPT2Tokenizer(object):
             return
         vocab_file = os.path.join(vocab_path, VOCAB_NAME)
         merge_file = os.path.join(vocab_path, MERGES_NAME)
-        json.dump(self.encoder, vocab_file)
+        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.encoder, ensure_ascii=False))
+
         index = 0
         with open(merge_file, "w", encoding="utf-8") as writer:
             writer.write(u'#version: 0.2\n')
@@ -203,9 +217,14 @@ class GPT2Tokenizer(object):
                     logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
                                    " Please check that the tokenizer is not corrupted!".format(merge_file))
                     index = token_index
-                writer.write(bpe_tokens + u'\n')
+                writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
-        return vocab_file, merge_file
+
+        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
+            for token in sorted(self.special_tokens.keys(), key=lambda kv: kv[1]):
+                writer.write(token + u'\n')
+
+        return vocab_file, merge_file, special_tokens_file
 
     def encode(self, text):
         bpe_tokens = []
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index f3ce7ab251..d9713e51eb 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -41,6 +41,7 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
 }
 VOCAB_NAME = 'vocab.json'
 MERGES_NAME = 'merges.txt'
+SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 def get_pairs(word):
     """
@@ -89,6 +90,11 @@ class OpenAIGPTTokenizer(object):
         else:
             vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
             merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
+            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
+            if not os.path.exists(special_tokens_file):
+                special_tokens_file = None
+            else:
+                logger.info("loading special tokens file {}".format(special_tokens_file))
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
@@ -117,7 +123,11 @@ class OpenAIGPTTokenizer(object):
             max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
+        if special_tokens_file and 'special_tokens' not in kwargs:
+            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+        else:
+            special_tokens = kwargs.pop('special_tokens', [])
+        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
         return tokenizer
 
     def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
@@ -269,7 +279,11 @@ class OpenAIGPTTokenizer(object):
             return
         vocab_file = os.path.join(vocab_path, VOCAB_NAME)
         merge_file = os.path.join(vocab_path, MERGES_NAME)
-        json.dump(self.encoder, vocab_file)
+        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.encoder, ensure_ascii=False))
+
         index = 0
         with open(merge_file, "w", encoding="utf-8") as writer:
             writer.write(u'#version: 0.2\n')
@@ -278,6 +292,11 @@ class OpenAIGPTTokenizer(object):
                     logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
                                    " Please check that the tokenizer is not corrupted!".format(merge_file))
                     index = token_index
-                writer.write(bpe_tokens + u'\n')
+                writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
-        return vocab_file, merge_file
+
+        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
+            for token in sorted(self.special_tokens.keys(), key=lambda kv: kv[1]):
+                writer.write(token + u'\n')
+
+        return vocab_file, merge_file, special_tokens_file
diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py
index 2b1bdd3a9a..1f695cfb12 100644
--- a/tests/tokenization_openai_test.py
+++ b/tests/tokenization_openai_test.py
@@ -52,7 +52,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-        vocab_file, merges_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
+        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
         tokenizer.from_pretrained("/tmp/")
         os.remove(vocab_file)
         os.remove(merges_file)
diff --git a/tests/tokenization_transfo_xl_test.py b/tests/tokenization_transfo_xl_test.py
index add2eb4e71..1a805f11e6 100644
--- a/tests/tokenization_transfo_xl_test.py
+++ b/tests/tokenization_transfo_xl_test.py
@@ -35,7 +35,7 @@ class TransfoXLTokenizationTest(unittest.TestCase):
         tokenizer.build_vocab()
         os.remove(vocab_file)
 
-        tokens = tokenizer.tokenize(u"<unk> UNwant\u00E9d,running")
+        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
 
         self.assertListEqual(
@@ -45,7 +45,7 @@ class TransfoXLTokenizationTest(unittest.TestCase):
         tokenizer.from_pretrained(vocab_file)
         os.remove(vocab_file)
 
-        tokens = tokenizer.tokenize(u"<unk> UNwant\u00E9d,running")
+        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
 
         self.assertListEqual(
@@ -56,15 +56,14 @@ class TransfoXLTokenizationTest(unittest.TestCase):
         tokenizer = TransfoXLTokenizer(lower_case=True)
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
+            tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
             ["hello", "!", "how", "are", "you", "?"])
-        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
 
     def test_full_tokenizer_no_lower(self):
         tokenizer = TransfoXLTokenizer(lower_case=False)
 
         self.assertListEqual(
-            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
+            tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
             ["HeLLo", "!", "how", "Are", "yoU", "?"])
 
 

From b17963d82ffa1355d222d3377594e61a25acd7aa Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 13:44:30 +0200
Subject: [PATCH 21/47] update readme

---
 README.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index daac69de9f..1e192941f0 100644
--- a/README.md
+++ b/README.md
@@ -796,8 +796,7 @@ This model *outputs*:
   - `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
   - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
 
-
-### Tokenizers:
+### Tokenizers
 
 #### `BertTokenizer`
 
@@ -816,6 +815,7 @@ and three methods:
 - `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
 - `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
 - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
+- `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: `vocab_file_path`. The vocabulary can be reloaded with `BertTokenizer.from_pretrained('vocab_file_path')` or `BertTokenizer.from_pretrained('directory_path')`.
 
 Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
 
@@ -837,6 +837,7 @@ and five methods:
 - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
 - `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments)
 - `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
+- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
 
 Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
 
@@ -844,6 +845,8 @@ Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch
 
 `TransfoXLTokenizer` perform word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by toekn frequency (for adaptive softmax). See the adaptive softmax paper ([Efficient softmax approximation for GPUs](http://arxiv.org/abs/1609.04309)) for more details.
 
+The API is similar to the API of `BertTokenizer` (see above).
+
 Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
 
 #### `GPT2Tokenizer`
@@ -860,11 +863,11 @@ and two methods:
 
 - `encode(text)`: convert a `str` in a list of `int` tokens by performing byte-level BPE.
 - `decode(tokens)`: convert back a list of `int` tokens in a `str`.
+- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `GPT2Tokenizer.from_pretrained('directory_path')`.
 
 Please refer to [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
 
-
-### Optimizers:
+### Optimizers
 
 #### `BertAdam`
 

From 9761aa48452712711d6b2ff04902b8a37ff294b3 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 14:12:08 +0200
Subject: [PATCH 22/47] add to_json_file method to configuration classes

---
 pytorch_pretrained_bert/modeling.py            | 5 +++++
 pytorch_pretrained_bert/modeling_gpt2.py       | 5 +++++
 pytorch_pretrained_bert/modeling_openai.py     | 5 +++++
 pytorch_pretrained_bert/modeling_transfo_xl.py | 5 +++++
 tests/modeling_gpt2_test.py                    | 9 +++++++++
 tests/modeling_openai_test.py                  | 9 +++++++++
 tests/modeling_test.py                         | 9 +++++++++
 tests/modeling_transfo_xl_test.py              | 9 +++++++++
 8 files changed, 56 insertions(+)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 2736e34d7f..6a71cbeea6 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -220,6 +220,11 @@ class BertConfig(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
+
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
 except ImportError:
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 7b00ce7730..fce564e9ea 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -180,6 +180,11 @@ class GPT2Config(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
+    def to_json_file(self, json_file_path):
+        """Write the output of `to_json_string()` to `json_file_path` (opened in 'w' mode as UTF-8 text)."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
+
 
 class Conv1D(nn.Module):
     def __init__(self, nf, nx):
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index b6252d097f..33bb4472a5 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -225,6 +225,11 @@ class OpenAIGPTConfig(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
+    def to_json_file(self, json_file_path):
+        """Write the output of `to_json_string()` to `json_file_path` (opened in 'w' mode as UTF-8 text)."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
+
 
 class Conv1D(nn.Module):
     def __init__(self, nf, rf, nx):
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index ac895a03a7..0ba986f5b4 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -316,6 +316,11 @@ class TransfoXLConfig(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
+    def to_json_file(self, json_file_path):
+        """Write the output of `to_json_string()` to `json_file_path` (opened in 'w' mode as UTF-8 text)."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
+
 
 class PositionalEmbedding(nn.Module):
     def __init__(self, demb):
diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py
index 12a539c44b..d542422060 100644
--- a/tests/modeling_gpt2_test.py
+++ b/tests/modeling_gpt2_test.py
@@ -16,6 +16,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import unittest
 import json
 import random
@@ -176,6 +177,14 @@ class GPT2ModelTest(unittest.TestCase):
         self.assertEqual(obj["vocab_size"], 99)
         self.assertEqual(obj["n_embd"], 37)
 
+    def test_config_to_json_file(self):  # round-trip: to_json_file -> from_json_file must preserve to_dict()
+        config_first = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
+        json_file_path = "/tmp/config.json"  # NOTE(review): hard-coded POSIX path; confirm this is acceptable on all CI platforms
+        config_first.to_json_file(json_file_path)
+        config_second = GPT2Config.from_json_file(json_file_path)  # reload what was just written
+        os.remove(json_file_path)  # removed before the assertion, so a failed comparison leaves no file behind
+        self.assertEqual(config_second.to_dict(), config_first.to_dict())
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_gpt2_model(*config_and_inputs)
diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index 1cc8b7d5dc..db03bf792e 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -16,6 +16,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import unittest
 import json
 import random
@@ -188,6 +189,14 @@ class OpenAIGPTModelTest(unittest.TestCase):
         self.assertEqual(obj["vocab_size"], 99)
         self.assertEqual(obj["n_embd"], 37)
 
+    def test_config_to_json_file(self):  # round-trip: to_json_file -> from_json_file must preserve to_dict()
+        config_first = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
+        json_file_path = "/tmp/config.json"  # NOTE(review): hard-coded POSIX path; confirm this is acceptable on all CI platforms
+        config_first.to_json_file(json_file_path)
+        config_second = OpenAIGPTConfig.from_json_file(json_file_path)  # reload what was just written
+        os.remove(json_file_path)  # removed before the assertion, so a failed comparison leaves no file behind
+        self.assertEqual(config_second.to_dict(), config_first.to_dict())
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_openai_model(*config_and_inputs)
diff --git a/tests/modeling_test.py b/tests/modeling_test.py
index c7a031cfb0..02d7a13fda 100644
--- a/tests/modeling_test.py
+++ b/tests/modeling_test.py
@@ -16,6 +16,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import unittest
 import json
 import random
@@ -251,6 +252,14 @@ class BertModelTest(unittest.TestCase):
         self.assertEqual(obj["vocab_size"], 99)
         self.assertEqual(obj["hidden_size"], 37)
 
+    def test_config_to_json_file(self):  # round-trip: to_json_file -> from_json_file must preserve to_dict()
+        config_first = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37)
+        json_file_path = "/tmp/config.json"  # NOTE(review): hard-coded POSIX path; confirm this is acceptable on all CI platforms
+        config_first.to_json_file(json_file_path)
+        config_second = BertConfig.from_json_file(json_file_path)  # reload what was just written
+        os.remove(json_file_path)  # removed before the assertion, so a failed comparison leaves no file behind
+        self.assertEqual(config_second.to_dict(), config_first.to_dict())
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_bert_model(*config_and_inputs)
diff --git a/tests/modeling_transfo_xl_test.py b/tests/modeling_transfo_xl_test.py
index 291d5d9d2a..a59d90b205 100644
--- a/tests/modeling_transfo_xl_test.py
+++ b/tests/modeling_transfo_xl_test.py
@@ -16,6 +16,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import unittest
 import json
 import random
@@ -186,6 +187,14 @@ class TransfoXLModelTest(unittest.TestCase):
         self.assertEqual(obj["n_token"], 96)
         self.assertEqual(obj["d_embed"], 37)
 
+    def test_config_to_json_file(self):  # round-trip: to_json_file -> from_json_file must preserve to_dict()
+        config_first = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)
+        json_file_path = "/tmp/config.json"  # NOTE(review): hard-coded POSIX path; confirm this is acceptable on all CI platforms
+        config_first.to_json_file(json_file_path)
+        config_second = TransfoXLConfig.from_json_file(json_file_path)  # reload what was just written
+        os.remove(json_file_path)  # removed before the assertion, so a failed comparison leaves no file behind
+        self.assertEqual(config_second.to_dict(), config_first.to_dict())
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
 

From 20577d8a7cb7dd38d3c5295c6f44bf377435e608 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 14:21:41 +0200
Subject: [PATCH 23/47] add configuration serialization to readme

---
 README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/README.md b/README.md
index 1e192941f0..2a59bb0d37 100644
--- a/README.md
+++ b/README.md
@@ -131,6 +131,7 @@ This package comprises the following classes that can be imported in Python and
 - Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files):
   - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
   - `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
+  - `GPT2Config` - Configuration class to store the configuration of a `GPT2Model` with utilities to read and write from JSON configuration files.
   - `TransfoXLConfig` - Configuration class to store the configuration of a `TransfoXLModel` with utilities to read and write from JSON configuration files.
 
 The repository further comprises:
@@ -524,6 +525,23 @@ model = GPT2Model.from_pretrained('gpt2')
 
 ```
 
+### Configuration classes
+
+Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are:
+
+- `BertConfig` for `BertModel` and BERT classes instances.
+- `OpenAIGPTConfig` for `OpenAIGPTModel` and OpenAI GPT classes instances.
+- `GPT2Config` for `GPT2Model` and OpenAI GPT-2 classes instances.
+- `TransfoXLConfig` for `TransfoXLModel` and Transformer-XL classes instances.
+
+These configuration classes contain a few utilities to load and save configurations:
+
+- `from_dict(cls, json_object)`: A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class.
+- `from_json_file(cls, json_file)`: A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class.
+- `to_dict()`: Serializes an instance to a Python dictionary. Returns a dictionary.
+- `to_json_string()`: Serializes an instance to a JSON string. Returns a string.
+- `to_json_file(json_file_path)`: Save an instance to a json file.
+
 ### PyTorch models
 
 #### 1. `BertModel`

From b3c6ee0ac1cd95bcd0a54a36a29daf599f389f93 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 14:24:52 +0200
Subject: [PATCH 24/47] tokenization updates

---
 pytorch_pretrained_bert/tokenization.py            | 5 +++--
 pytorch_pretrained_bert/tokenization_transfo_xl.py | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index 8fd65f55f0..3937d6e011 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -135,9 +135,10 @@ class BertTokenizer(object):
         return tokens
 
     def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a path."""
+        """Save the tokenizer vocabulary to `vocab_path`: a directory (file VOCAB_NAME inside it) or a file path."""
         index = 0
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        # A directory receives the default VOCAB_NAME file; any other path is used as the file itself.
+        vocab_file = os.path.join(vocab_path, VOCAB_NAME) if os.path.isdir(vocab_path) else vocab_path
         with open(vocab_file, "w", encoding="utf-8") as writer:
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index f704a035db..ddebc57c10 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -145,8 +145,10 @@ class TransfoXLTokenizer(object):
             raise ValueError('No <unkown> token in vocabulary')
 
     def save_vocabulary(self, vocab_path):
+        """Save the tokenizer state to `vocab_path` (a directory or a file path) and return the file written."""
         index = 0
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        # A directory receives the default VOCAB_NAME file; any other path is used as the file itself.
+        vocab_file = os.path.join(vocab_path, VOCAB_NAME) if os.path.isdir(vocab_path) else vocab_path
         torch.save(self.__dict__, vocab_file)
         return vocab_file
 

From 179a2c2ff66ebf147c562243d7f4b6f37c0cdd23 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 14:33:23 +0200
Subject: [PATCH 25/47] update example to work with new serialization semantic

---
 examples/run_classifier.py | 25 ++++++++++++++-----------
 examples/run_openai_gpt.py | 20 +++++++++++++-------
 examples/run_squad.py      | 25 ++++++++++++++-----------
 examples/run_swag.py       | 25 ++++++++++++++-----------
 4 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index 751d581ad9..ba49d18b8d 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -37,7 +37,7 @@ from sklearn.metrics import matthews_corrcoef, f1_score
 
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.tokenization import BertTokenizer, VOCAB_NAME
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
@@ -857,18 +857,21 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-        # Save a trained model and the associated configuration
+        # Save a trained model, configuration and tokenizer
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        torch.save(model_to_save.state_dict(), output_model_file)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-        with open(output_config_file, 'w') as f:
-            f.write(model_to_save.config.to_json_string())
 
-        # Load a trained model and config that you have fine-tuned
-        config = BertConfig(output_config_file)
-        model = BertForSequenceClassification(config, num_labels=num_labels)
-        model.load_state_dict(torch.load(output_model_file))
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(output_vocab_file)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
     else:
         model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
     model.to(device)
diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index ee30a7a0a4..1686c9eda6 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -40,6 +40,7 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
 from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
+from pytorch_pretrained_bert.modeling_openai import WEIGHTS_NAME, CONFIG_NAME
 
 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
 
@@ -218,15 +219,20 @@ def main():
 
     # Save a trained model
     if args.do_train:
+        # Save a trained model, configuration and tokenizer
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
-        config = model.config
-        torch.save(model_to_save.state_dict(), output_model_file)
 
-        # Load a trained model that you have fine-tuned
-        model_state_dict = torch.load(output_model_file)
-        model = OpenAIGPTDoubleHeadsModel(config)
-        model.load_state_dict(model_state_dict)
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
+        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
         model.to(device)
 
     if args.do_eval:
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 043b795326..045c0afe1e 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -39,7 +39,7 @@ from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfi
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
-                                                  whitespace_tokenize)
+                                                  whitespace_tokenize, VOCAB_NAME)
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -1009,18 +1009,21 @@ def main():
                     global_step += 1
 
     if args.do_train:
-        # Save a trained model and the associated configuration
+        # Save a trained model, configuration and tokenizer
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        torch.save(model_to_save.state_dict(), output_model_file)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-        with open(output_config_file, 'w') as f:
-            f.write(model_to_save.config.to_json_string())
 
-        # Load a trained model and config that you have fine-tuned
-        config = BertConfig(output_config_file)
-        model = BertForQuestionAnswering(config)
-        model.load_state_dict(torch.load(output_model_file))
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(output_vocab_file)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
     else:
         model = BertForQuestionAnswering.from_pretrained(args.bert_model)
 
diff --git a/examples/run_swag.py b/examples/run_swag.py
index f193582640..fa145c29d7 100644
--- a/examples/run_swag.py
+++ b/examples/run_swag.py
@@ -35,7 +35,7 @@ from tqdm import tqdm, trange
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 from pytorch_pretrained_bert.modeling import (BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME)
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.tokenization import BertTokenizer, VOCAB_NAME
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -473,18 +473,21 @@ def main():
 
 
     if args.do_train:
-        # Save a trained model and the associated configuration
+        # Save a trained model, configuration and tokenizer
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        torch.save(model_to_save.state_dict(), output_model_file)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-        with open(output_config_file, 'w') as f:
-            f.write(model_to_save.config.to_json_string())
 
-        # Load a trained model and config that you have fine-tuned
-        config = BertConfig(output_config_file)
-        model = BertForMultipleChoice(config, num_choices=4)
-        model.load_state_dict(torch.load(output_model_file))
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(output_vocab_file)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
     else:
         model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
     model.to(device)

From 60ea6c59d24f63681e120e704d2f823bfcc2c04e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 15:00:33 +0200
Subject: [PATCH 26/47] added best practices for serialization in README and
 examples

---
 README.md                                     | 76 +++++++++++++++++++
 examples/run_classifier.py                    | 11 ++-
 examples/run_openai_gpt.py                    |  4 +-
 examples/run_squad.py                         | 11 ++-
 examples/run_swag.py                          | 11 ++-
 pytorch_pretrained_bert/__init__.py           |  2 +-
 pytorch_pretrained_bert/file_utils.py         |  3 +
 pytorch_pretrained_bert/modeling.py           |  8 +-
 pytorch_pretrained_bert/modeling_gpt2.py      |  5 +-
 pytorch_pretrained_bert/modeling_openai.py    |  4 +-
 .../modeling_transfo_xl.py                    |  5 +-
 11 files changed, 106 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 2a59bb0d37..2f725f1786 100644
--- a/README.md
+++ b/README.md
@@ -525,6 +525,82 @@ model = GPT2Model.from_pretrained('gpt2')
 
 ```
 
+### Serialization best-practices: saving and re-loading a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL)
+
+There are three types of files you need to save to be able to reload a fine-tuned model:
+
+- the model itself, which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices),
+- the configuration file of the model which is saved as a JSON file, and
+- the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
+
+Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
+
+```python
+from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
+
+output_dir = "./models/"
+
+# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+# If we have a distributed model, save only the encapsulated model
+# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+model_to_save = model.module if hasattr(model, 'module') else model
+
+# If we save using the predefined names, we can load using `from_pretrained`
+output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+output_config_file = os.path.join(output_dir, CONFIG_NAME)
+
+torch.save(model_to_save.state_dict(), output_model_file)
+model_to_save.config.to_json_file(output_config_file)
+tokenizer.save_vocabulary(output_dir)
+
+# Step 2: Re-load the saved model and vocabulary
+
+# Example for a Bert model
+model = BertForQuestionAnswering.from_pretrained(output_dir)
+tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
+# Example for a GPT model
+model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
+tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
+```
+
+Here is another way you can save and reload the model if you want to use specific paths for each type of file:
+
+```python
+output_model_file = "./models/my_own_model_file.bin"
+output_config_file = "./models/my_own_config_file.bin"
+output_vocab_file = "./models/my_own_vocab_file.bin"
+
+# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+# If we have a distributed model, save only the encapsulated model
+# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+model_to_save = model.module if hasattr(model, 'module') else model
+
+torch.save(model_to_save.state_dict(), output_model_file)
+model_to_save.config.to_json_file(output_config_file)
+tokenizer.save_vocabulary(output_vocab_file)
+
+# Step 2: Re-load the saved model and vocabulary
+
+# Since we didn't save using the predefined WEIGHTS_NAME and CONFIG_NAME names, we cannot load using `from_pretrained`.
+# Here is how to do it in this situation:
+
+# Example for a Bert model
+config = BertConfig.from_json_file(output_config_file)
+model = BertForQuestionAnswering(config)
+state_dict = torch.load(output_model_file)
+model.load_state_dict(state_dict)
+tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
+
+# Example for a GPT model
+config = OpenAIGPTConfig.from_json_file(output_config_file)
+model = OpenAIGPTDoubleHeadsModel(config)
+state_dict = torch.load(output_model_file)
+model.load_state_dict(state_dict)
+tokenizer = OpenAIGPTTokenizer(output_vocab_file)
+```
+
 ### Configuration classes
 
 Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are:
diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index ba49d18b8d..46a428b3b8 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -35,9 +35,9 @@ from torch.nn import CrossEntropyLoss, MSELoss
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import matthews_corrcoef, f1_score
 
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.tokenization import BertTokenizer, VOCAB_NAME
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
@@ -863,15 +863,14 @@ def main():
         # If we save using the predefined names, we can load using `from_pretrained`
         output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
         output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-        output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME)
 
         torch.save(model_to_save.state_dict(), output_model_file)
         model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(output_vocab_file)
+        tokenizer.save_vocabulary(args.output_dir)
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
     else:
         model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
     model.to(device)
diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index 1686c9eda6..cb5aa8d9cb 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -39,8 +39,8 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
-from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
-from pytorch_pretrained_bert.modeling_openai import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+                                     OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
 
 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
 
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 045c0afe1e..14e6bd7ab8 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -34,12 +34,12 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
-                                                  whitespace_tokenize, VOCAB_NAME)
+                                                  whitespace_tokenize)
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -1015,15 +1015,14 @@ def main():
         # If we save using the predefined names, we can load using `from_pretrained`
         output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
         output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-        output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME)
 
         torch.save(model_to_save.state_dict(), output_model_file)
         model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(output_vocab_file)
+        tokenizer.save_vocabulary(args.output_dir)
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
     else:
         model = BertForQuestionAnswering.from_pretrained(args.bert_model)
 
diff --git a/examples/run_swag.py b/examples/run_swag.py
index fa145c29d7..a6cfdbe311 100644
--- a/examples/run_swag.py
+++ b/examples/run_swag.py
@@ -32,10 +32,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-from pytorch_pretrained_bert.modeling import (BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME)
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
-from pytorch_pretrained_bert.tokenization import BertTokenizer, VOCAB_NAME
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -479,15 +479,14 @@ def main():
         # If we save using the predefined names, we can load using `from_pretrained`
         output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
         output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-        output_vocab_file = os.path.join(args.output_dir, VOCAB_NAME)
 
         torch.save(model_to_save.state_dict(), output_model_file)
         model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(output_vocab_file)
+        tokenizer.save_vocabulary(args.output_dir)
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
     else:
         model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
     model.to(device)
diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index bd455b8d9c..28d215d8bd 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -21,4 +21,4 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model,
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
-from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path
+from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME
diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 8601edde23..6de7e259e5 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -33,6 +33,9 @@ except (AttributeError, ImportError):
     PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                               os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
 
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "pytorch_model.bin"
+
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 6a71cbeea6..dca6ac53f2 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -32,7 +32,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
 
-from .file_utils import cached_path
+from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
 
 logger = logging.getLogger(__name__)
 
@@ -45,8 +45,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
 }
-CONFIG_NAME = 'bert_config.json'
-WEIGHTS_NAME = 'pytorch_model.bin'
+BERT_CONFIG_NAME = 'bert_config.json'
 TF_WEIGHTS_NAME = 'model.ckpt'
 
 def load_tf_weights_in_bert(model, tf_checkpoint_path):
@@ -586,6 +585,9 @@ class BertPreTrainedModel(nn.Module):
             serialization_dir = tempdir
         # Load config
         config_file = os.path.join(serialization_dir, CONFIG_NAME)
+        if not os.path.exists(config_file):
+            # Backward compatibility with old naming format
+            config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
         config = BertConfig.from_json_file(config_file)
         logger.info("Model config {}".format(config))
         # Instantiate model.
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index fce564e9ea..e6017d33e4 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -34,7 +34,7 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
+from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
 from .modeling import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -42,9 +42,6 @@ logger = logging.getLogger(__name__)
 PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"}
 PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"}
 
-CONFIG_NAME = "config.json"
-WEIGHTS_NAME = "pytorch_model.bin"
-
 def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
     """
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 33bb4472a5..57a7921d7a 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -34,7 +34,7 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
+from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
 from .modeling import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -42,8 +42,6 @@ logger = logging.getLogger(__name__)
 PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
 PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
 
-CONFIG_NAME = "config.json"
-WEIGHTS_NAME = "pytorch_model.bin"
 
 def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
     """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 0ba986f5b4..0b732cdef1 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -40,7 +40,7 @@ from torch.nn.parameter import Parameter
 
 from .modeling import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .file_utils import cached_path
+from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
 
 logger = logging.getLogger(__name__)
 
@@ -50,8 +50,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
 PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
 }
-CONFIG_NAME = 'config.json'
-WEIGHTS_NAME = 'pytorch_model.bin'
+
 TF_WEIGHTS_NAME = 'model.ckpt'
 
 def build_tf_to_pytorch_map(model, config):

From cc433070238d8e3c093b12cb2b9ba34028adce93 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 15:06:10 +0200
Subject: [PATCH 27/47] update readme

---
 README.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 2f725f1786..0a61992efc 100644
--- a/README.md
+++ b/README.md
@@ -462,10 +462,12 @@ Here is a detailed documentation of the classes in the package and how to use th
 
 | Sub-section | Description |
 |-|-|
-| [Loading Google AI's/OpenAI's pre-trained weights](#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump) | How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance |
-| [PyTorch models](#PyTorch-models) | API of the BERT, GPT, GPT-2 and Transformer-XL PyTorch model classes |
+| [Loading pre-trained weights](#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump) | How to load Google AI/OpenAI's pre-trained weights or a PyTorch saved instance |
+| [Serialization best-practices](#serialization-best-practices) | How to save and reload a fine-tuned model |
+| [Configurations](#configurations) | API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL |
+| [Models](#models) | API of the PyTorch model classes for BERT, GPT, GPT-2 and Transformer-XL |
 | [Tokenizers](#tokenizers) | API of the tokenizers class for BERT, GPT, GPT-2 and Transformer-XL|
-| [Optimizers](#optimizerss) |  API of the optimizers |
+| [Optimizers](#optimizers) |  API of the optimizers |
 
 ### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
 
@@ -525,8 +527,9 @@ model = GPT2Model.from_pretrained('gpt2')
 
 ```
 
-### Serialization best-practices: saving and re-loading a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL)
+### Serialization best-practices
 
+This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
 There are three types of files you need to save to be able to reload a fine-tuned model:
 
 - the model it-self which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices),
@@ -601,7 +604,7 @@ model.load_state_dict(state_dict)
 tokenizer = OpenAIGPTTokenizer(output_vocab_file)
 ```
 
-### Configuration classes
+### Configurations
 
 Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are:
 
@@ -618,7 +621,7 @@ These configuration classes contains a few utilities to load and save configurat
 - `to_json_string()`: Serializes an instance to a JSON string. Returns a string.
 - `to_json_file(json_file_path)`: Save an instance to a json file.
 
-### PyTorch models
+### Models
 
 #### 1. `BertModel`
 

From 1135f2384ab735759e8c1a0643dba938e4e609ea Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 15:22:40 +0200
Subject: [PATCH 28/47] clean up logger in examples for distributed case

---
 README.md                  | 16 ++++++++++------
 examples/run_classifier.py |  8 +++++---
 examples/run_squad.py      |  8 +++++---
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 0a61992efc..caf415508f 100644
--- a/README.md
+++ b/README.md
@@ -1274,18 +1274,20 @@ To get these results we used a combination of:
 
 Here is the full list of hyper-parameters for this run:
 ```bash
+export SQUAD_DIR=/path/to/SQUAD
+
 python ./run_squad.py \
   --bert_model bert-large-uncased \
   --do_train \
   --do_predict \
   --do_lower_case \
-  --train_file $SQUAD_TRAIN \
-  --predict_file $SQUAD_EVAL \
+  --train_file $SQUAD_DIR/train-v1.1.json \
+  --predict_file $SQUAD_DIR/dev-v1.1.json \
   --learning_rate 3e-5 \
   --num_train_epochs 2 \
   --max_seq_length 384 \
   --doc_stride 128 \
-  --output_dir $OUTPUT_DIR \
+  --output_dir /tmp/debug_squad/ \
   --train_batch_size 24 \
   --gradient_accumulation_steps 2
 ```
@@ -1294,18 +1296,20 @@ If you have a recent GPU (starting from NVIDIA Volta series), you should try **1
 
 Here is an example of hyper-parameters for a FP16 run we tried:
 ```bash
+export SQUAD_DIR=/path/to/SQUAD
+
 python ./run_squad.py \
   --bert_model bert-large-uncased \
   --do_train \
   --do_predict \
   --do_lower_case \
-  --train_file $SQUAD_TRAIN \
-  --predict_file $SQUAD_EVAL \
+  --train_file $SQUAD_DIR/train-v1.1.json \
+  --predict_file $SQUAD_DIR/dev-v1.1.json \
   --learning_rate 3e-5 \
   --num_train_epochs 2 \
   --max_seq_length 384 \
   --doc_stride 128 \
-  --output_dir $OUTPUT_DIR \
+  --output_dir /tmp/debug_squad/ \
   --train_batch_size 24 \
   --fp16 \
   --loss_scale 128
diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index 46a428b3b8..112be6fbcb 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -40,9 +40,6 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification, Bert
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -697,6 +694,11 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
+
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+
     logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
         device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 14e6bd7ab8..00ee368b14 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -46,9 +46,6 @@ if sys.version_info[0] == 2:
 else:
     import pickle
 
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -848,6 +845,11 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
+
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+
     logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
         device, n_gpu, bool(args.local_rank != -1), args.fp16))
 

From 7816f7921fd5a21fdc74ca0f29589c74bceed0e2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 15:27:10 +0200
Subject: [PATCH 29/47] clean up distributed training logging in run_squad
 example

---
 examples/run_squad.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 00ee368b14..bad46203bc 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -985,7 +985,7 @@ def main():
 
         model.train()
         for _ in trange(int(args.num_train_epochs), desc="Epoch"):
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                 if n_gpu == 1:
                     batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
                 input_ids, input_mask, segment_ids, start_positions, end_positions = batch
@@ -1058,7 +1058,7 @@ def main():
         model.eval()
         all_results = []
         logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
+        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
             if len(all_results) % 1000 == 0:
                 logger.info("Processing example: %d" % (len(all_results)))
             input_ids = input_ids.to(device)

From 2499b0a5fcdb168ccb0095e837b2022953935af2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 15:33:04 +0200
Subject: [PATCH 30/47] add ptvsd to run_squad

---
 examples/run_squad.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index bad46203bc..cd85219f5f 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -834,7 +834,17 @@ def main():
     parser.add_argument('--null_score_diff_threshold',
                         type=float, default=0.0,
                         help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
+    print(args)
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
 
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

From df5d9c3551a6405feb697a1cad903dddffa04bfe Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 15:43:01 +0200
Subject: [PATCH 31/47] load all models on cpu

---
 pytorch_pretrained_bert/modeling.py            | 2 +-
 pytorch_pretrained_bert/modeling_gpt2.py       | 2 +-
 pytorch_pretrained_bert/modeling_openai.py     | 2 +-
 pytorch_pretrained_bert/modeling_transfo_xl.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index dca6ac53f2..8dfb5fe51e 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -594,7 +594,7 @@ class BertPreTrainedModel(nn.Module):
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
             weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
-            state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
+            state_dict = torch.load(weights_path, map_location='cpu')
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index e6017d33e4..7cf1e6b59d 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -418,7 +418,7 @@ class GPT2PreTrainedModel(nn.Module):
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
+            state_dict = torch.load(resolved_archive_file, map_location='cpu')
         if from_tf:
             # Directly load from a TensorFlow checkpoint (stored as NumPy array)
             return load_tf_weights_in_gpt2(model, resolved_archive_file)
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 57a7921d7a..3dedc53f11 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -476,7 +476,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
+            state_dict = torch.load(resolved_archive_file, map_location='cpu')
         if from_tf:
             # Directly load from a TensorFlow checkpoint (stored as NumPy array)
             return load_tf_weights_in_openai_gpt(model, resolved_archive_file)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 0b732cdef1..e8fffc5b60 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -944,7 +944,7 @@ class TransfoXLPreTrainedModel(nn.Module):
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
+            state_dict = torch.load(resolved_archive_file, map_location='cpu')
         if from_tf:
             # Directly load from a TensorFlow checkpoint
             return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path)

From d61602245566b1e42dca9238b3b8a0f23f3fdad1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 16:07:45 +0200
Subject: [PATCH 32/47] fix openai special tokens loading

---
 pytorch_pretrained_bert/tokenization_openai.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index d9713e51eb..7a10271175 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -87,6 +87,7 @@ class OpenAIGPTTokenizer(object):
         if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
             vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
             merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
+            special_tokens_file = None
         else:
             vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
             merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)

From 3571187ef6f07a7ba63ee5b355e312f2fbfaaab7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Apr 2019 16:43:56 +0200
Subject: [PATCH 33/47] fix saving models in distributed setting examples

---
 examples/run_classifier.py | 1 +
 examples/run_squad.py      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index 112be6fbcb..4994118467 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -859,6 +859,7 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Save a trained model, configuration and tokenizer
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
 
diff --git a/examples/run_squad.py b/examples/run_squad.py
index cd85219f5f..410fd85298 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -1020,7 +1020,7 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-    if args.do_train:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Save a trained model, configuration and tokenizer
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
 

From 18a8a15f78a10ac6bf272bc762232b3f16df30e2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Apr 2019 17:00:55 +0200
Subject: [PATCH 34/47] improving GPT2 tokenization and adding tests

---
 README.md                                     |  7 +-
 pytorch_pretrained_bert/tokenization_gpt2.py  | 96 +++++++++++++++----
 .../tokenization_openai.py                    | 15 ++-
 tests/tokenization_gpt2_test.py               | 68 +++++++++++++
 tests/tokenization_openai_test.py             | 17 ++--
 5 files changed, 169 insertions(+), 34 deletions(-)
 create mode 100644 tests/tokenization_gpt2_test.py

diff --git a/README.md b/README.md
index caf415508f..fde35d23ea 100644
--- a/README.md
+++ b/README.md
@@ -929,10 +929,11 @@ This class has four arguments:
 
 and five methods:
 
-- `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
+- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing BPE tokenization.
 - `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
 - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
 - `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments)
+- `encode(text)`: convert a `str` in a list of `int` tokens by performing BPE encoding.
 - `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
 - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
 
@@ -958,6 +959,10 @@ This class has three arguments:
 
 and two methods:
 
+- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing byte-level BPE.
+- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
+- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
+- `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments)
 - `encode(text)`: convert a `str` in a list of `int` tokens by performing byte-level BPE.
 - `decode(tokens)`: convert back a list of `int` tokens in a `str`.
 - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index ab80876ee5..491db616e4 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -16,6 +16,7 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
+import sys
 import json
 import logging
 import os
@@ -138,7 +139,7 @@ class GPT2Tokenizer(object):
         tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
         return tokenizer
 
-    def __init__(self, vocab_file, merges_file, errors='replace', max_len=None):
+    def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
         self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
@@ -153,8 +154,25 @@ class GPT2Tokenizer(object):
         # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
+        self.special_tokens = {}
+        self.special_tokens_decoder = {}
+        self.set_special_tokens(special_tokens)
+
     def __len__(self):
-        return len(self.encoder)
+        return len(self.encoder) + len(self.special_tokens)
+
+    def set_special_tokens(self, special_tokens):
+        """ Add a list of additional tokens to the encoder.
+            The additional tokens are indexed starting from the last index of the
+            current vocabulary in the order of the `special_tokens` list.
+        """
+        if not special_tokens:
+            self.special_tokens = {}
+            self.special_tokens_decoder = {}
+            return
+        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
+        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
+        logger.info("Special tokens {}".format(self.special_tokens))
 
     def bpe(self, token):
         if token in self.cache:
@@ -197,6 +215,54 @@ class GPT2Tokenizer(object):
         self.cache[token] = word
         return word
 
+    def tokenize(self, text):
+        """ Tokenize a string. """
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """ Converts a sequence of tokens into ids using the vocab. """
+        ids = []
+        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.encoder.get(tokens, 0)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.encoder.get(token, 0))
+        if len(ids) > self.max_len:
+            logger.warning(
+                "Token indices sequence length is longer than the specified maximum "
+                " sequence length for this OpenAI GPT model ({} > {}). Running this"
+                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
+            )
+        return ids
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        """Converts a sequence of ids in BPE tokens using the vocab."""
+        tokens = []
+        for i in ids:
+            if i in self.special_tokens_decoder:
+                if not skip_special_tokens:
+                    tokens.append(self.special_tokens_decoder[i])
+            else:
+                tokens.append(self.decoder[i])
+        return tokens
+
+    def encode(self, text):
+        return self.convert_tokens_to_ids(self.tokenize(text))
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+        return text
+
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary and merge files to a directory."""
         if not os.path.isdir(vocab_path):
@@ -220,26 +286,14 @@ class GPT2Tokenizer(object):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
+        index = len(self.encoder)
         with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token in sorted(self.special_tokens.keys(), key=lambda kv: kv[1]):
+            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
+                    index = token_index
                 writer.write(token + u'\n')
+                index += 1
 
         return vocab_file, merge_file, special_tokens_file
-
-    def encode(self, text):
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
-            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
-        if len(bpe_tokens) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
-            )
-        return bpe_tokens
-
-    def decode(self, tokens):
-        text = ''.join([self.decoder[token] for token in tokens])
-        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
-        return text
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 7a10271175..1088b5222b 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -150,6 +150,8 @@ class OpenAIGPTTokenizer(object):
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
+        self.special_tokens = {}
+        self.special_tokens_decoder = {}
         self.set_special_tokens(special_tokens)
 
     def __len__(self):
@@ -261,7 +263,10 @@ class OpenAIGPTTokenizer(object):
                 tokens.append(self.decoder[i])
         return tokens
 
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
+    def encode(self, text):
+        return self.convert_tokens_to_ids(self.tokenize(text))
+
+    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
         """Converts a sequence of ids in a string."""
         tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
@@ -296,8 +301,14 @@ class OpenAIGPTTokenizer(object):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
+        index = len(self.encoder)
         with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token in sorted(self.special_tokens.keys(), key=lambda kv: kv[1]):
+            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
+                    index = token_index
                 writer.write(token + u'\n')
+                index += 1
 
         return vocab_file, merge_file, special_tokens_file
diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py
new file mode 100644
index 0000000000..29633bc17c
--- /dev/null
+++ b/tests/tokenization_gpt2_test.py
@@ -0,0 +1,68 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+import json
+
+from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
+
+
+class GPT2TokenizationTest(unittest.TestCase):
+
+    def test_full_tokenizer(self):
+        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
+                 "lo", "low", "er",
+                 "low", "lowest", "newer", "wider"]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
+        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
+            json.dump(vocab_tokens, fp)
+            vocab_file = fp.name
+        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
+            fp.write("\n".join(merges))
+            merges_file = fp.name
+
+        tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        os.remove(vocab_file)
+        os.remove(merges_file)
+
+        text = "lower"
+        bpe_tokens = ["low", "er"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + ["<unk>"]
+        input_bpe_tokens = [13, 12, 16]
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
+        tokenizer_2 = GPT2Tokenizer.from_pretrained("/tmp/")
+        os.remove(vocab_file)
+        os.remove(merges_file)
+        os.remove(special_tokens_file)
+
+        self.assertListEqual(
+            [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
+             tokenizer.special_tokens, tokenizer.special_tokens_decoder],
+            [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
+             tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py
index 1f695cfb12..fb42cdd8cb 100644
--- a/tests/tokenization_openai_test.py
+++ b/tests/tokenization_openai_test.py
@@ -38,7 +38,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
             fp.write("\n".join(merges))
             merges_file = fp.name
 
-        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
+        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
         os.remove(vocab_file)
         os.remove(merges_file)
 
@@ -53,19 +53,16 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
         vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
-        tokenizer.from_pretrained("/tmp/")
+        tokenizer_2 = OpenAIGPTTokenizer.from_pretrained("/tmp/")
         os.remove(vocab_file)
         os.remove(merges_file)
+        os.remove(special_tokens_file)
 
-        text = "lower"
-        bpe_tokens = ["low", "er</w>"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [14, 15, 20]
         self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+            [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
+             tokenizer.special_tokens, tokenizer.special_tokens_decoder],
+            [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
+             tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
 
 if __name__ == '__main__':

From bdaba1897c14e0243d7fb58ddf5061957c70eea6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Apr 2019 17:44:06 +0200
Subject: [PATCH 35/47] updating GPT tokenization

---
 pytorch_pretrained_bert/tokenization_openai.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 1088b5222b..214a476ce9 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -273,9 +273,8 @@ class OpenAIGPTTokenizer(object):
         if clean_up_tokenization_spaces:
             out_string = out_string.replace('<unk>', '')
             out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
-                    ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
-                    ).replace(" 've", "'ve")
+                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
         return out_string
 
     def save_vocabulary(self, vocab_path):

From 07154dadb4fc5ce47e3d82dd33debb8e588039bd Mon Sep 17 00:00:00 2001
From: Abhi Sharma <18308855+SudoSharma@users.noreply.github.com>
Date: Tue, 16 Apr 2019 11:11:49 -0700
Subject: [PATCH 36/47] Fix indentation for unconditional generation

---
 examples/run_gpt2.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 61fbf9f323..9ac2b31961 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -107,25 +107,25 @@ def run_model():
                     print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                     print(text)
             print("=" * 80)
-    if args.unconditional:
-        generated = 0
-        for _ in range(args.nsamples // args.batch_size):
-            out = sample_sequence(
-                model=model, length=args.length,
-                context=None,
-                start_token=enc.encoder['<|endoftext|>'],
-                batch_size=args.batch_size,
-                temperature=args.temperature, top_k=args.top_k, device=device
-            )
-            out = out[:,1:].tolist()
-            for i in range(args.batch_size):
-                generated += 1
-                text = enc.decode(out[i])
-                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                print(text)
-        print("=" * 80)
-        if args.unconditional:
-            break
+      if args.unconditional:
+          generated = 0
+          for _ in range(args.nsamples // args.batch_size):
+              out = sample_sequence(
+                  model=model, length=args.length,
+                  context=None,
+                  start_token=enc.encoder['<|endoftext|>'],
+                  batch_size=args.batch_size,
+                  temperature=args.temperature, top_k=args.top_k, device=device
+              )
+              out = out[:,1:].tolist()
+              for i in range(args.batch_size):
+                  generated += 1
+                  text = enc.decode(out[i])
+                  print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
+                  print(text)
+          print("=" * 80)
+          if args.unconditional:
+              break
 
 if __name__ == '__main__':
     run_model()

From 9e666aaa297a84f8276cd891cd1a151e5266349e Mon Sep 17 00:00:00 2001
From: Abhi Sharma <18308855+SudoSharma@users.noreply.github.com>
Date: Tue, 16 Apr 2019 11:42:34 -0700
Subject: [PATCH 37/47] Fix gradient overflow issue during attention mask

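This fix is in reference to issue #382. The attention mask subtracted 1e10 from masked positions; that constant exceeds the largest finite float16 value (65504), whereas 1e4 is representable in half precision. With this change, GPT-2 can be trained in mixed precision, which I've confirmed with testing. I also tested unconditional generation on multiple seeds before and after changing 1e10 to 1e4 and there was no difference. Please let me know if there is anything else I can do to make this pull request better. Thanks for all your work!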
---
 pytorch_pretrained_bert/modeling_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 7cf1e6b59d..063c525d98 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -218,7 +218,7 @@ class Attention(nn.Module):
             w = w / math.sqrt(v.size(-1))
         nd, ns = w.size(-2), w.size(-1)
         b = self.bias[:, :, ns-nd:ns, :ns]
-        w = w * b - 1e10 * (1 - b)
+        w = w * b - 1e4 * (1 - b)
 
         w = nn.Softmax(dim=-1)(w)
         return torch.matmul(w, v)

From 87677fcc4dfda7ee9e0b5609344b46d6e3ccd227 Mon Sep 17 00:00:00 2001
From: Ben Mann <8enmann@gmail.com>
Date: Tue, 16 Apr 2019 15:23:21 -0700
Subject: [PATCH 38/47] [run_gpt2.py] temperature should be a float, not int

---
 examples/run_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 61fbf9f323..4b081d3a1d 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -58,7 +58,7 @@ def run_model():
     parser.add_argument("--nsamples", type=int, default=1)
     parser.add_argument("--batch_size", type=int, default=-1)
     parser.add_argument("--length", type=int, default=-1)
-    parser.add_argument("--temperature", type=int, default=1)
+    parser.add_argument("--temperature", type=float, default=1.0)
     parser.add_argument("--top_k", type=int, default=0)
     parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
     args = parser.parse_args()

From bc70779bf0dc7a1b59eeb65d106d1116feb4a828 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 10:56:15 +0200
Subject: [PATCH 39/47] fixed GPT-2 tokenization on python 2

---
 pytorch_pretrained_bert/file_utils.py        | 2 +-
 pytorch_pretrained_bert/tokenization_gpt2.py | 5 +++--
 tests/tokenization_gpt2_test.py              | 3 ++-
 tests/tokenization_openai_test.py            | 2 +-
 4 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 6de7e259e5..6a24b099e1 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -227,7 +227,7 @@ def get_from_cache(url, cache_dir=None):
             meta = {'url': url, 'etag': etag}
             meta_path = cache_path + '.json'
             with open(meta_path, 'w', encoding="utf-8") as meta_file:
-                json.dump(meta, meta_file)
+                meta_file.write(json.dumps(meta))
 
             logger.info("removing temp file %s", temp_file.name)
 
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index 491db616e4..0e91498f22 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -59,6 +59,7 @@ def bytes_to_unicode():
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     And avoids mapping to whitespace/control characters the bpe code barfs on.
     """
+    _chr = unichr if sys.version_info[0] == 2 else chr
     bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
     cs = bs[:]
     n = 0
@@ -67,7 +68,7 @@ def bytes_to_unicode():
             bs.append(b)
             cs.append(2**8+n)
             n += 1
-    cs = [chr(n) for n in cs]
+    cs = [_chr(n) for n in cs]
     return dict(zip(bs, cs))
 
 def get_pairs(word):
@@ -219,7 +220,7 @@ class GPT2Tokenizer(object):
         """ Tokenize a string. """
         bpe_tokens = []
         for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8'))
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
         return bpe_tokens
 
diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py
index 29633bc17c..0773574360 100644
--- a/tests/tokenization_gpt2_test.py
+++ b/tests/tokenization_gpt2_test.py
@@ -31,13 +31,14 @@ class GPT2TokenizationTest(unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
         with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            json.dump(vocab_tokens, fp)
+            fp.write(json.dumps(vocab_tokens))
             vocab_file = fp.name
         with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
             fp.write("\n".join(merges))
             merges_file = fp.name
 
         tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        print("encoder", tokenizer.byte_encoder)
         os.remove(vocab_file)
         os.remove(merges_file)
 
diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py
index fb42cdd8cb..2011ccc1df 100644
--- a/tests/tokenization_openai_test.py
+++ b/tests/tokenization_openai_test.py
@@ -32,7 +32,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
         with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            json.dump(vocab_tokens, fp)
+            fp.write(json.dumps(vocab_tokens))
             vocab_file = fp.name
         with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
             fp.write("\n".join(merges))

From 5afa497cbfc53c679a9b22997b6312fad57ee2f8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 11:04:41 +0200
Subject: [PATCH 40/47] fix GPT-2 tokenization to also work on Python 3

---
 pytorch_pretrained_bert/tokenization_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index 0e91498f22..80be4435df 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -220,7 +220,7 @@ class GPT2Tokenizer(object):
         """ Tokenize a string. """
         bpe_tokens = []
         for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[ord(b)] for b in token.encode('utf-8'))
+            token = ''.join(self.byte_encoder[ord(b)] for b in token)
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
         return bpe_tokens
 

From 31d387604c67d738740a9ae9350df0a273802966 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 11:58:27 +0200
Subject: [PATCH 41/47] adding s3 model tests with --runslow

---
 .circleci/config.yml                  |  4 ++--
 tests/conftest.py                     | 19 +++++++++++++++++++
 tests/modeling_gpt2_test.py           | 12 +++++++++++-
 tests/modeling_openai_test.py         | 12 +++++++++++-
 tests/modeling_test.py                | 11 +++++++++++
 tests/modeling_transfo_xl_test.py     | 12 +++++++++++-
 tests/tokenization_gpt2_test.py       | 11 ++++++++++-
 tests/tokenization_openai_test.py     | 12 +++++++++++-
 tests/tokenization_test.py            | 11 ++++++++++-
 tests/tokenization_transfo_xl_test.py | 11 ++++++++++-
 10 files changed, 106 insertions(+), 9 deletions(-)
 create mode 100644 tests/conftest.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index b57b478030..7296e07ca3 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -9,7 +9,7 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest ftfy spacy
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv tests/
+            - run: python -m pytest -sv tests/ --runslow
     build_py2:
         working_directory: ~/pytorch-pretrained-BERT
         docker:
@@ -20,7 +20,7 @@ jobs:
             - run: sudo pip install pytest spacy
             - run: sudo pip install ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv tests/
+            - run: python -m pytest -sv tests/ --runslow
 workflows:
   version: 2
   build_and_test:
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000000..841ebc8df9
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,19 @@
+# content of conftest.py
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--runslow", action="store_true", default=False, help="run slow tests"
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--runslow"):
+        # --runslow given in cli: do not skip slow tests
+        return
+    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
+    for item in items:
+        if "slow" in item.keywords:
+            item.add_marker(skip_slow)
diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py
index d542422060..8f4581b37f 100644
--- a/tests/modeling_gpt2_test.py
+++ b/tests/modeling_gpt2_test.py
@@ -20,12 +20,14 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
-
+from pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class GPT2ModelTest(unittest.TestCase):
     class GPT2ModelTester(object):
@@ -185,6 +187,14 @@ class GPT2ModelTest(unittest.TestCase):
         os.remove(json_file_path)
         self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_gpt2_model(*config_and_inputs)
diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index db03bf792e..4e7d9d542b 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -20,12 +20,14 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-
+from pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class OpenAIGPTModelTest(unittest.TestCase):
     class OpenAIGPTModelTester(object):
@@ -197,6 +199,14 @@ class OpenAIGPTModelTest(unittest.TestCase):
         os.remove(json_file_path)
         self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_openai_model(*config_and_inputs)
diff --git a/tests/modeling_test.py b/tests/modeling_test.py
index 02d7a13fda..5cde383fdf 100644
--- a/tests/modeling_test.py
+++ b/tests/modeling_test.py
@@ -20,6 +20,8 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
@@ -27,6 +29,7 @@ from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification)
+from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 class BertModelTest(unittest.TestCase):
@@ -260,6 +263,14 @@ class BertModelTest(unittest.TestCase):
         os.remove(json_file_path)
         self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_bert_model(*config_and_inputs)
diff --git a/tests/modeling_transfo_xl_test.py b/tests/modeling_transfo_xl_test.py
index a59d90b205..e5c5f3d163 100644
--- a/tests/modeling_transfo_xl_test.py
+++ b/tests/modeling_transfo_xl_test.py
@@ -20,11 +20,13 @@ import os
 import unittest
 import json
 import random
+import shutil
+import pytest
 
 import torch
 
 from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
-
+from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
@@ -195,6 +197,14 @@ class TransfoXLModelTest(unittest.TestCase):
         os.remove(json_file_path)
         self.assertEqual(config_second.to_dict(), config_first.to_dict())
 
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
 
diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py
index 0773574360..870f61ca79 100644
--- a/tests/tokenization_gpt2_test.py
+++ b/tests/tokenization_gpt2_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 
 class GPT2TokenizationTest(unittest.TestCase):
@@ -64,6 +66,13 @@ class GPT2TokenizationTest(unittest.TestCase):
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py
index 2011ccc1df..a57f86be57 100644
--- a/tests/tokenization_openai_test.py
+++ b/tests/tokenization_openai_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 
 class OpenAIGPTTokenizationTest(unittest.TestCase):
@@ -64,6 +66,14 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py
index 15cc7ccd82..fe120a522c 100644
--- a/tests/tokenization_test.py
+++ b/tests/tokenization_test.py
@@ -17,12 +17,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 from io import open
+import shutil
+import pytest
 
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
-                                                  _is_whitespace)
+                                                  _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)
 
 
 class TokenizationTest(unittest.TestCase):
@@ -56,6 +58,13 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
 
     def test_chinese(self):
         tokenizer = BasicTokenizer()
diff --git a/tests/tokenization_transfo_xl_test.py b/tests/tokenization_transfo_xl_test.py
index 1a805f11e6..bf0ac5db2f 100644
--- a/tests/tokenization_transfo_xl_test.py
+++ b/tests/tokenization_transfo_xl_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 from io import open
+import shutil
+import pytest
 
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 
 class TransfoXLTokenizationTest(unittest.TestCase):
@@ -66,6 +68,13 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
             ["HeLLo", "!", "how", "Are", "yoU", "?"])
 
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()

From 929579f3b5198185a5dd7f09eeee646b5f294398 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 12:35:08 +0200
Subject: [PATCH 42/47] fix #497

---
 pytorch_pretrained_bert/tokenization_gpt2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index 80be4435df..07777292a3 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -97,6 +97,7 @@ class GPT2Tokenizer(object):
         if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
             vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
             merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
+            special_tokens_file = None
         else:
             vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
             merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)

From bcde2c61cb2a9b4b5c2b2234e2e8cee505e695e8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 12:35:38 +0200
Subject: [PATCH 43/47] fix #497

---
 pytorch_pretrained_bert/tokenization_gpt2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index 80be4435df..07777292a3 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -97,6 +97,7 @@ class GPT2Tokenizer(object):
         if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
             vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
             merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
+            special_tokens_file = None
         else:
             vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
             merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)

From fa765202402499486efd1cb3484c5e70555479c2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 13:32:22 +0200
Subject: [PATCH 44/47] fix file_utils on python 2

---
 pytorch_pretrained_bert/file_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 6a24b099e1..6de7e259e5 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -227,7 +227,7 @@ def get_from_cache(url, cache_dir=None):
             meta = {'url': url, 'etag': etag}
             meta_path = cache_path + '.json'
             with open(meta_path, 'w', encoding="utf-8") as meta_file:
-                meta_file.write(json.dumps(meta))
+                json.dump(meta, meta_file)
 
             logger.info("removing temp file %s", temp_file.name)
 

From 265550ec34bfa756538c60e0d5d4c906ee78e1ce Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 14:22:35 +0200
Subject: [PATCH 45/47] relax network connection requirements

---
 pytorch_pretrained_bert/file_utils.py | 29 ++++++++++++++++++++-------
 tests/tokenization_gpt2_test.py       |  2 +-
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 6de7e259e5..e7e1714f97 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -5,11 +5,13 @@ Copyright by the AllenNLP authors.
 """
 from __future__ import (absolute_import, division, print_function, unicode_literals)
 
+import sys
 import json
 import logging
 import os
 import shutil
 import tempfile
+import fnmatch
 from functools import wraps
 from hashlib import sha256
 import sys
@@ -191,17 +193,30 @@ def get_from_cache(url, cache_dir=None):
     if url.startswith("s3://"):
         etag = s3_etag(url)
     else:
-        response = requests.head(url, allow_redirects=True)
-        if response.status_code != 200:
-            raise IOError("HEAD request failed for url {} with status code {}"
-                          .format(url, response.status_code))
-        etag = response.headers.get("ETag")
+        try:
+            response = requests.head(url, allow_redirects=True)
+            if response.status_code != 200:
+                etag = None
+            else:
+                etag = response.headers.get("ETag")
+        except EnvironmentError:
+            etag = None
 
+    if sys.version_info[0] == 2 and etag is not None:
+        etag = etag.decode('utf-8')
     filename = url_to_filename(url, etag)
 
     # get cache path to put the file
     cache_path = os.path.join(cache_dir, filename)
 
+    # If we don't have a connection (etag is None) and can't identify the file,
+    # fall back to a previously cached copy of it, if one exists
+    if not os.path.exists(cache_path) and etag is None:
+        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
+        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
+        if matching_files:
+            cache_path = os.path.join(cache_dir, matching_files[-1])
+
     if not os.path.exists(cache_path):
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
@@ -226,8 +241,8 @@ def get_from_cache(url, cache_dir=None):
             logger.info("creating metadata file for %s", cache_path)
             meta = {'url': url, 'etag': etag}
             meta_path = cache_path + '.json'
-            with open(meta_path, 'w', encoding="utf-8") as meta_file:
-                json.dump(meta, meta_file)
+            with open(meta_path, 'w') as meta_file:
+                meta_file.write(json.dumps(meta, indent=4))
 
             logger.info("removing temp file %s", temp_file.name)
 
diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py
index 870f61ca79..cfd13de391 100644
--- a/tests/tokenization_gpt2_test.py
+++ b/tests/tokenization_gpt2_test.py
@@ -66,7 +66,7 @@ class GPT2TokenizationTest(unittest.TestCase):
             [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
              tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
-    @pytest.mark.slow
+    # @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
         cache_dir = "/tmp/pytorch_pretrained_bert_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:

From 23d4554ec05d6cf5b35960052de8f324b7e0ec86 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 14:48:34 +0200
Subject: [PATCH 46/47] is python 2 happy now

---
 pytorch_pretrained_bert/file_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index e7e1714f97..17bdd258ea 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -242,7 +242,10 @@ def get_from_cache(url, cache_dir=None):
             meta = {'url': url, 'etag': etag}
             meta_path = cache_path + '.json'
             with open(meta_path, 'w') as meta_file:
-                meta_file.write(json.dumps(meta, indent=4))
+                output_string = json.dumps(meta)
+                if sys.version_info[0] == 2 and isinstance(output_string, str):
+                    output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
+                meta_file.write(output_string)
 
             logger.info("removing temp file %s", temp_file.name)
 

From 34ae5bf8385cce8f792c803fa288eccf472003ff Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 17 Apr 2019 14:52:12 +0200
Subject: [PATCH 47/47] small clean up in tests

---
 tests/tokenization_gpt2_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/tokenization_gpt2_test.py b/tests/tokenization_gpt2_test.py
index cfd13de391..4ae804a060 100644
--- a/tests/tokenization_gpt2_test.py
+++ b/tests/tokenization_gpt2_test.py
@@ -40,7 +40,6 @@ class GPT2TokenizationTest(unittest.TestCase):
             merges_file = fp.name
 
         tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        print("encoder", tokenizer.byte_encoder)
         os.remove(vocab_file)
         os.remove(merges_file)