[HUGE] Refactoring tokenizers backend - padding - truncation - pre-tokenized pipeline - fast tokenizers - tests (#4510)
* Use tokenizers pre-tokenized pipeline * failing pretrokenized test * Fix is_pretokenized in python * add pretokenized tests * style and quality * better tests for batched pretokenized inputs * tokenizers clean up - new padding_strategy - split the files * [HUGE] refactoring tokenizers - padding - truncation - tests * style and quality * bump up requied tokenizers version to 0.8.0-rc1 * switched padding/truncation API - simpler better backward compat * updating tests for custom tokenizers * style and quality - tests on pad * fix QA pipeline * fix backward compatibility for max_length only * style and quality * Various cleans up - add verbose * fix tests * update docstrings * Fix tests * Docs reformatted * __call__ method documented Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com> Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
@@ -44,6 +44,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
"[UNK]",
|
||||
"[CLS]",
|
||||
"[SEP]",
|
||||
"[PAD]",
|
||||
"[MASK]",
|
||||
"want",
|
||||
"##want",
|
||||
"##ed",
|
||||
@@ -62,7 +64,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def get_rust_tokenizer(self, **kwargs):
|
||||
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
def get_input_output_texts(self, tokenizer):
|
||||
input_text = "UNwant\u00E9d,running"
|
||||
output_text = "unwanted, running"
|
||||
return input_text, output_text
|
||||
@@ -72,7 +74,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokens = tokenizer.tokenize("UNwant\u00E9d,running")
|
||||
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
|
||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
|
||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
@@ -96,6 +98,25 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
rust_ids = rust_tokenizer.encode(sequence)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
# With lower casing
|
||||
tokenizer = self.get_tokenizer(do_lower_case=True)
|
||||
rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)
|
||||
|
||||
sequence = "UNwant\u00E9d,running"
|
||||
|
||||
tokens = tokenizer.tokenize(sequence)
|
||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||
self.assertListEqual(tokens, rust_tokens)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
ids = tokenizer.encode(sequence)
|
||||
rust_ids = rust_tokenizer.encode(sequence)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
def test_chinese(self):
|
||||
tokenizer = BasicTokenizer()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user