[HUGE] Refactoring tokenizers backend - padding - truncation - pre-tokenized pipeline - fast tokenizers - tests (#4510)

* Use tokenizers pre-tokenized pipeline

* failing pretrokenized test

* Fix is_pretokenized in python

* add pretokenized tests

* style and quality

* better tests for batched pretokenized inputs

* tokenizers clean up - new padding_strategy - split the files

* [HUGE] refactoring tokenizers - padding - truncation - tests

* style and quality

* bump up requied tokenizers version to 0.8.0-rc1

* switched padding/truncation API - simpler better backward compat

* updating tests for custom tokenizers

* style and quality - tests on pad

* fix QA pipeline

* fix backward compatibility for max_length only

* style and quality

* Various cleans up - add verbose

* fix tests

* update docstrings

* Fix tests

* Docs reformatted

* __call__ method documented

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
Anthony MOI
2020-06-15 17:12:51 -04:00
committed by GitHub
parent ebba39e4e1
commit 36434220fc
25 changed files with 3821 additions and 2680 deletions

View File

@@ -44,6 +44,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"[UNK]",
"[CLS]",
"[SEP]",
"[PAD]",
"[MASK]",
"want",
"##want",
"##ed",
@@ -62,7 +64,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def get_rust_tokenizer(self, **kwargs):
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00E9d,running"
output_text = "unwanted, running"
return input_text, output_text
@@ -72,7 +74,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokens = tokenizer.tokenize("UNwant\u00E9d,running")
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
@@ -96,6 +98,25 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
# With lower casing
tokenizer = self.get_tokenizer(do_lower_case=True)
rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)
sequence = "UNwant\u00E9d,running"
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
def test_chinese(self):
tokenizer = BasicTokenizer()