Tokenizer fast warnings (#2922)
* Remove warning when pad_to_max_length is not set.
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
* Move the RoBERTa warning to the RoBERTa tokenizer and not the GPT2 base tokenizer.
  Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
This commit is contained in:
@@ -269,9 +269,3 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
|||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.warning(
|
|
||||||
"RobertaTokenizerFast has an issue when working on mask language modeling "
|
|
||||||
"where it introduces an extra encoded space before the mask token."
|
|
||||||
"See https://github.com/huggingface/transformers/pull/2778 for more information."
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -211,6 +211,12 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
|
|||||||
self.max_len_single_sentence = self.max_len - self.num_added_tokens(False) # take into account special tokens
|
self.max_len_single_sentence = self.max_len - self.num_added_tokens(False) # take into account special tokens
|
||||||
self.max_len_sentences_pair = self.max_len - self.num_added_tokens(True) # take into account special tokens
|
self.max_len_sentences_pair = self.max_len - self.num_added_tokens(True) # take into account special tokens
|
||||||
|
|
||||||
|
logger.warning(
|
||||||
|
"RobertaTokenizerFast has an issue when working on mask language modeling "
|
||||||
|
"where it introduces an extra encoded space before the mask token."
|
||||||
|
"See https://github.com/huggingface/transformers/pull/2778 for more information."
|
||||||
|
)
|
||||||
|
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
|
|||||||
@@ -85,7 +85,7 @@ def truncate_and_pad(
|
|||||||
pad_type_id=pad_token_type_id,
|
pad_type_id=pad_token_type_id,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
)
|
)
|
||||||
else:
|
elif pad_to_max_length:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n"
|
"Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n"
|
||||||
"To remove this error, you can add a new pad token and then resize model embedding:\n"
|
"To remove this error, you can add a new pad token and then resize model embedding:\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user