[tokenizers] Several small improvements and bug fixes (#5287)
* avoid recursion in id checks for fast tokenizers
* better typings and fix #5232
* align slow and fast tokenizers behaviors for Roberta and GPT2
* style and quality
* fix tests - improve typings
This commit is contained in:
@@ -54,9 +54,10 @@ class CommonFastTokenizerTest(unittest.TestCase):
|
||||
if tok_case.filter is None or (
|
||||
tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name)
|
||||
):
|
||||
kwargs = dict(t for t in tok_case.kwargs) if tok_case.kwargs else {}
|
||||
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
|
||||
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name)
|
||||
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name)
|
||||
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
|
||||
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
|
||||
|
||||
self.fast_align_python(tokenizer_r, tokenizer_p, tok_case, pretrained_name)
|
||||
self.fast_only(tokenizer_r)
|
||||
@@ -767,7 +768,16 @@ class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
|
||||
|
||||
class RobertaFastTokenizerTest(CommonFastTokenizerTest):
|
||||
TOKENIZERS_CLASSES = frozenset(
|
||||
[Tokenizer("Roberta", RobertaTokenizerFast, RobertaTokenizer, "vocab_file", filter_roberta_detectors, None)]
|
||||
[
|
||||
Tokenizer(
|
||||
"Roberta",
|
||||
RobertaTokenizerFast,
|
||||
RobertaTokenizer,
|
||||
"vocab_file",
|
||||
filter_roberta_detectors,
|
||||
(("cls_token", "<s>"),),
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
def assert_embeded_special_tokens(self, tokenizer_r, tokenizer_p):
|
||||
|
||||
@@ -18,7 +18,7 @@ import json
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer, RobertaTokenizerFast
|
||||
from transformers.tokenization_roberta import VOCAB_FILES_NAMES, AddedToken, RobertaTokenizer, RobertaTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
from .utils import slow
|
||||
@@ -139,7 +139,9 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
# Testing spaces after special tokens
|
||||
mask = "<mask>"
|
||||
tokenizer.add_special_tokens({"mask_token": mask})
|
||||
tokenizer.add_special_tokens(
|
||||
{"mask_token": AddedToken(mask, lstrip=True, rstrip=False)}
|
||||
) # mask token has a left space
|
||||
mask_ind = tokenizer.convert_tokens_to_ids(mask)
|
||||
|
||||
sequence = "Encode <mask> sequence"
|
||||
|
||||
Reference in New Issue
Block a user