[HUGE] Refactoring tokenizers backend - padding - truncation - pre-tokenized pipeline - fast tokenizers - tests (#4510)
* Use tokenizers pre-tokenized pipeline * failing pretrokenized test * Fix is_pretokenized in python * add pretokenized tests * style and quality * better tests for batched pretokenized inputs * tokenizers clean up - new padding_strategy - split the files * [HUGE] refactoring tokenizers - padding - truncation - tests * style and quality * bump up requied tokenizers version to 0.8.0-rc1 * switched padding/truncation API - simpler better backward compat * updating tests for custom tokenizers * style and quality - tests on pad * fix QA pipeline * fix backward compatibility for max_length only * style and quality * Various cleans up - add verbose * fix tests * update docstrings * Fix tests * Docs reformatted * __call__ method documented Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com> Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
@@ -60,11 +60,26 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
|
||||
def get_input_output_texts(self):
|
||||
def get_input_output_texts(self, tokenizer):
|
||||
input_text = "こんにちは、世界。 \nこんばんは、世界。"
|
||||
output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。"
|
||||
return input_text, output_text
|
||||
|
||||
def get_clean_sequence(self, tokenizer):
|
||||
input_text, output_text = self.get_input_output_texts(tokenizer)
|
||||
ids = tokenizer.encode(output_text, add_special_tokens=False)
|
||||
text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
|
||||
return text, ids
|
||||
|
||||
def test_pretokenized_inputs(self):
|
||||
pass # TODO add if relevant
|
||||
|
||||
def test_maximum_encoding_length_pair_input(self):
|
||||
pass # TODO add if relevant
|
||||
|
||||
def test_maximum_encoding_length_single_input(self):
|
||||
pass # TODO add if relevant
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = self.tokenizer_class(self.vocab_file)
|
||||
|
||||
@@ -157,11 +172,20 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
def get_input_output_texts(self, tokenizer):
|
||||
input_text = "こんにちは、世界。 \nこんばんは、世界。"
|
||||
output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
|
||||
return input_text, output_text
|
||||
|
||||
def test_pretokenized_inputs(self):
|
||||
pass # TODO add if relevant
|
||||
|
||||
def test_maximum_encoding_length_pair_input(self):
|
||||
pass # TODO add if relevant
|
||||
|
||||
def test_maximum_encoding_length_single_input(self):
|
||||
pass # TODO add if relevant
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user