From e2a6445ebbc36121817c1f605d9a09a335f5fba5 Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan
Date: Thu, 20 Feb 2020 17:55:03 +0100
Subject: [PATCH] Tokenizer fast warnings (#2922)

* Remove warning when pad_to_max_length is not set.

Signed-off-by: Morgan Funtowicz

* Move RoberTa warning to RoberTa and not GPT2 base tokenizer.

Signed-off-by: Morgan Funtowicz
---
 src/transformers/tokenization_gpt2.py    | 6 ------
 src/transformers/tokenization_roberta.py | 6 ++++++
 src/transformers/tokenization_utils.py   | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py
index 19f578631a..5e8d9c7728 100644
--- a/src/transformers/tokenization_gpt2.py
+++ b/src/transformers/tokenization_gpt2.py
@@ -269,9 +269,3 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
             unk_token=unk_token,
             **kwargs,
         )
-
-        logger.warning(
-            "RobertaTokenizerFast has an issue when working on mask language modeling "
-            "where it introduces an extra encoded space before the mask token."
-            "See https://github.com/huggingface/transformers/pull/2778 for more information."
-        )
diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py
index ff2aa11004..fda82fb307 100644
--- a/src/transformers/tokenization_roberta.py
+++ b/src/transformers/tokenization_roberta.py
@@ -211,6 +211,12 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
         self.max_len_single_sentence = self.max_len - self.num_added_tokens(False)  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - self.num_added_tokens(True)  # take into account special tokens
 
+        logger.warning(
+            "RobertaTokenizerFast has an issue when working on mask language modeling "
+            "where it introduces an extra encoded space before the mask token."
+            "See https://github.com/huggingface/transformers/pull/2778 for more information."
+        )
+
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
         if token_ids_1 is None:
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index e3af47b037..80ab188055 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -85,7 +85,7 @@ def truncate_and_pad(
             pad_type_id=pad_token_type_id,
             pad_token=pad_token,
         )
-    else:
+    elif pad_to_max_length:
         logger.warning(
             "Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n"
             "To remove this error, you can add a new pad token and then resize model embedding:\n"