From 4a49c2258406b0249d430d197dc21e14a2504b8e Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Tue, 5 Mar 2019 12:31:45 -0800 Subject: [PATCH] Warn instead of raising in BERT and GPT-2 tokenizers as well, to allow for pre-caching of tokens --- pytorch_pretrained_bert/tokenization.py | 2 +- pytorch_pretrained_bert/tokenization_gpt2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 1fabea852a..605af8e0b9 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -101,7 +101,7 @@ class BertTokenizer(object): for token in tokens: ids.append(self.vocab[token]) if len(ids) > self.max_len: - raise ValueError( + logger.warning( "Token indices sequence length is longer than the specified maximum " " sequence length for this BERT model ({} > {}). Running this" " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 96b0ece7f0..257db6e61e 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -193,7 +193,7 @@ class GPT2Tokenizer(object): token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) if len(bpe_tokens) > self.max_len: - raise ValueError( + logger.warning( "Token indices sequence length is longer than the specified maximum " " sequence length for this OpenAI GPT-2 model ({} > {}). Running this" " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)