From c0cf0a04d5c3eb21848c750398a310273b5a783d Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Wed, 27 Feb 2019 18:01:06 -0800
Subject: [PATCH 1/3] Fix typo

---
 examples/run_openai_gpt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index 546c11b528..bddfcc4e0a 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -163,7 +163,7 @@ def main():
     datasets = (train_dataset, eval_dataset)
     encoded_datasets = tokenize_and_encode(datasets)
 
-    # Compute the mex input length for the Transformer
+    # Compute the max input length for the Transformer
     max_length = model.config.n_positions // 2 - 2
     input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                            for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)

From 9775b2eb277c246f27dda08c6b1df29abee3da5b Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sat, 2 Mar 2019 16:30:21 -0800
Subject: [PATCH 2/3] Allow tokenization of sequences > 512 for caching

For many applications requiring randomized data access, it is easier to cache the tokenized representations than the raw words. Raising a ValueError when a sequence is longer than max_len makes that impossible, so log a warning instead of raising.
---
 pytorch_pretrained_bert/tokenization_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 24ca1ab596..1665b3f900 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -232,7 +232,7 @@ class OpenAIGPTTokenizer(object):
             else:
                 ids.append(self.encoder.get(token, 0))
         if len(ids) > self.max_len:
-            raise ValueError(
+            logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
                 " sequence length for this OpenAI GPT model ({} > {}). Running this"
                 " sequence through the model will result in indexing errors".format(len(ids), self.max_len)

From 4a49c2258406b0249d430d197dc21e14a2504b8e Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Tue, 5 Mar 2019 12:31:45 -0800
Subject: [PATCH 3/3] Warn instead of raising in BERT and GPT-2 tokenizers as
 well, to allow for pre-caching of tokens

---
 pytorch_pretrained_bert/tokenization.py      | 2 +-
 pytorch_pretrained_bert/tokenization_gpt2.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index 1fabea852a..605af8e0b9 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -101,7 +101,7 @@ class BertTokenizer(object):
         for token in tokens:
             ids.append(self.vocab[token])
         if len(ids) > self.max_len:
-            raise ValueError(
+            logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
                 " sequence length for this BERT model ({} > {}). Running this"
                 " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index 96b0ece7f0..257db6e61e 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -193,7 +193,7 @@ class GPT2Tokenizer(object):
             token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
             bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
         if len(bpe_tokens) > self.max_len:
-            raise ValueError(
+            logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
                 " sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
                 " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)