[tokenizers] Several small improvements and bug fixes (#5287)

* avoid recursion in id checks for fast tokenizers

* better typings and fix #5232

* align slow and fast tokenizer behaviors for Roberta and GPT2

* style and quality

* fix tests - improve typings
This commit is contained in:
Thomas Wolf
2020-06-25 22:17:14 +02:00
committed by GitHub
parent 24f46ea3f3
commit 315f464b0a
6 changed files with 64 additions and 36 deletions

View File

@@ -18,7 +18,7 @@ import json
import os
import unittest
from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer, RobertaTokenizerFast
from transformers.tokenization_roberta import VOCAB_FILES_NAMES, AddedToken, RobertaTokenizer, RobertaTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow
@@ -139,7 +139,9 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Testing spaces after special tokens
mask = "<mask>"
tokenizer.add_special_tokens({"mask_token": mask})
tokenizer.add_special_tokens(
{"mask_token": AddedToken(mask, lstrip=True, rstrip=False)}
) # mask token has a left space
mask_ind = tokenizer.convert_tokens_to_ids(mask)
sequence = "Encode <mask> sequence"