From 9775b2eb277c246f27dda08c6b1df29abee3da5b Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sat, 2 Mar 2019 16:30:21 -0800
Subject: [PATCH] Allow tokenization of sequences > 512 for caching

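For many applications that require random data access, it is easier to cache the tokenized representations than the words. The maximum sequence length only matters once a sequence is actually run through the model (where exceeding it results in indexing errors), so log a warning instead of raising a ValueError when the tokenized sequence is longer than the maximum.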
---
 pytorch_pretrained_bert/tokenization_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 24ca1ab596..1665b3f900 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -232,7 +232,7 @@ class OpenAIGPTTokenizer(object):
             else:
                 ids.append(self.encoder.get(token, 0))
         if len(ids) > self.max_len:
-            raise ValueError(
+            logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
                 " sequence length for this OpenAI GPT model ({} > {}). Running this"
                 " sequence through the model will result in indexing errors".format(len(ids), self.max_len)