Merge pull request #337 from CatalinVoss/patch-2

Allow tokenization of sequences longer than 512 tokens for caching (log a warning instead of raising ValueError)
2019-03-06 09:45:49 +01:00
parent 7b9e5a54b5 4a49c22584
commit 477ec4b6cc
4 changed files with 4 additions and 4 deletions
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -120,7 +120,7 @@ class BertTokenizer(object):
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
-            raise ValueError(
+            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)