diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py index 546c11b528..bddfcc4e0a 100644 --- a/examples/run_openai_gpt.py +++ b/examples/run_openai_gpt.py @@ -163,7 +163,7 @@ def main(): datasets = (train_dataset, eval_dataset) encoded_datasets = tokenize_and_encode(datasets) - # Compute the mex input length for the Transformer + # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 4ea8de6f70..21c5299356 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -120,7 +120,7 @@ class BertTokenizer(object): for token in tokens: ids.append(self.vocab[token]) if len(ids) > self.max_len: - raise ValueError( + logger.warning( "Token indices sequence length is longer than the specified maximum " " sequence length for this BERT model ({} > {}). Running this" " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py index 96b0ece7f0..257db6e61e 100644 --- a/pytorch_pretrained_bert/tokenization_gpt2.py +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -193,7 +193,7 @@ class GPT2Tokenizer(object): token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) if len(bpe_tokens) > self.max_len: - raise ValueError( + logger.warning( "Token indices sequence length is longer than the specified maximum " " sequence length for this OpenAI GPT-2 model ({} > {}). Running this" " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len) diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py index 24ca1ab596..1665b3f900 100644 --- a/pytorch_pretrained_bert/tokenization_openai.py +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -232,7 +232,7 @@ class OpenAIGPTTokenizer(object): else: ids.append(self.encoder.get(token, 0)) if len(ids) > self.max_len: - raise ValueError( + logger.warning( "Token indices sequence length is longer than the specified maximum " " sequence length for this OpenAI GPT model ({} > {}). Running this" " sequence through the model will result in indexing errors".format(len(ids), self.max_len)