From 3aa37b945e21019575fc183ce750a37b86efc634 Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Thu, 1 Jul 2021 12:37:07 +0200 Subject: [PATCH] Add test for a WordLevel tokenizer model (#12437) * add a test for a WordLevel tokenizer * adapt common test to new tokenizer --- tests/test_tokenization_common.py | 14 +++------- tests/test_tokenization_fast.py | 43 +++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 4995e8dfd2..0a662cc62c 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -3168,11 +3168,8 @@ class TokenizerTesterMixin: decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) expected_result = "This is the first sentence" - # OpenAIGPT always lowercases and has no arg. - if new_tokenizer.init_kwargs.get("do_lower_case", False) or tokenizer.__class__.__name__.startswith( - "OpenAIGPT" - ): - expected_result = expected_result.lower() + if tokenizer.backend_tokenizer.normalizer is not None: + expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) self.assertEqual(expected_result, decoded_input) # We check that the parameters of the tokenizer remained the same @@ -3287,11 +3284,8 @@ class TokenizerTesterMixin: decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) expected_result = "This is the first sentence" - # OpenAIGPT always lowercases and has no arg. - if new_tokenizer.init_kwargs.get("do_lower_case", False) or tokenizer.__class__.__name__.startswith( - "OpenAIGPT" - ): - expected_result = expected_result.lower() + if tokenizer.backend_tokenizer.normalizer is not None: + expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) self.assertEqual(expected_result, decoded_input) diff --git a/tests/test_tokenization_fast.py b/tests/test_tokenization_fast.py index 796a3f07c2..30fc688270 100644 --- a/tests/test_tokenization_fast.py +++ b/tests/test_tokenization_fast.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile import unittest from transformers import PreTrainedTokenizerFast @@ -33,9 +35,12 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase): super().setUp() self.test_rust_tokenizer = True - self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})] + model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"] - tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast") + # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment) + self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths] + + tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0]) tokenizer.save_pretrained(self.tmpdirname) def test_pretrained_model_lists(self): @@ -51,3 +56,37 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_tokenizer_signature(self): # PreTrainedTokenizerFast doesn't have tokenizer_file in its signature pass + + def test_training_new_tokenizer(self): + tmpdirname_orig = self.tmpdirname + # Here we want to test the 2 available tokenizers that use 2 different types of models: Unigram and WordLevel. + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + try: + self.tmpdirname = tempfile.mkdtemp() + tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + tokenizer.save_pretrained(self.tmpdirname) + super().test_training_new_tokenizer() + finally: + # Even if the test fails, we must be sure that the folder is deleted and that the default tokenizer + # is restored + shutil.rmtree(self.tmpdirname) + self.tmpdirname = tmpdirname_orig + + def test_training_new_tokenizer_with_special_tokens_change(self): + tmpdirname_orig = self.tmpdirname + # Here we want to test the 2 available tokenizers that use 2 different types of models: Unigram and WordLevel. + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + try: + self.tmpdirname = tempfile.mkdtemp() + tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + tokenizer.save_pretrained(self.tmpdirname) + super().test_training_new_tokenizer_with_special_tokens_change() + finally: + # Even if the test fails, we must be sure that the folder is deleted and that the default tokenizer + # is restored + shutil.rmtree(self.tmpdirname) + self.tmpdirname = tmpdirname_orig