Add test for a WordLevel tokenizer model (#12437)
* add a test for a WordLevel tokenizer
* adapt common test to new tokenizer
This commit is contained in:
@@ -3168,11 +3168,8 @@ class TokenizerTesterMixin:
|
|||||||
decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
|
decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
|
||||||
expected_result = "This is the first sentence"
|
expected_result = "This is the first sentence"
|
||||||
|
|
||||||
# OpenAIGPT always lowercases and has no arg.
|
if tokenizer.backend_tokenizer.normalizer is not None:
|
||||||
if new_tokenizer.init_kwargs.get("do_lower_case", False) or tokenizer.__class__.__name__.startswith(
|
expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
|
||||||
"OpenAIGPT"
|
|
||||||
):
|
|
||||||
expected_result = expected_result.lower()
|
|
||||||
self.assertEqual(expected_result, decoded_input)
|
self.assertEqual(expected_result, decoded_input)
|
||||||
|
|
||||||
# We check that the parameters of the tokenizer remained the same
|
# We check that the parameters of the tokenizer remained the same
|
||||||
@@ -3287,11 +3284,8 @@ class TokenizerTesterMixin:
|
|||||||
decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
|
decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
|
||||||
expected_result = "This is the first sentence"
|
expected_result = "This is the first sentence"
|
||||||
|
|
||||||
# OpenAIGPT always lowercases and has no arg.
|
if tokenizer.backend_tokenizer.normalizer is not None:
|
||||||
if new_tokenizer.init_kwargs.get("do_lower_case", False) or tokenizer.__class__.__name__.startswith(
|
expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
|
||||||
"OpenAIGPT"
|
|
||||||
):
|
|
||||||
expected_result = expected_result.lower()
|
|
||||||
self.assertEqual(expected_result, decoded_input)
|
self.assertEqual(expected_result, decoded_input)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,8 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers import PreTrainedTokenizerFast
|
from transformers import PreTrainedTokenizerFast
|
||||||
@@ -33,9 +35,12 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
super().setUp()
|
super().setUp()
|
||||||
self.test_rust_tokenizer = True
|
self.test_rust_tokenizer = True
|
||||||
|
|
||||||
self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})]
|
model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
|
||||||
|
|
||||||
tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")
|
# Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
|
||||||
|
self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
|
||||||
|
|
||||||
|
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
def test_pretrained_model_lists(self):
|
def test_pretrained_model_lists(self):
|
||||||
@@ -51,3 +56,37 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_rust_tokenizer_signature(self):
|
def test_rust_tokenizer_signature(self):
|
||||||
# PreTrainedTokenizerFast doesn't have tokenizer_file in its signature
|
# PreTrainedTokenizerFast doesn't have tokenizer_file in its signature
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def test_training_new_tokenizer(self):
    """Run the inherited ``test_training_new_tokenizer`` once per entry of ``self.tokenizers_list``.

    For every ``(tokenizer_class, pretrained_name, kwargs)`` entry, the pretrained tokenizer is loaded
    with ``self.rust_tokenizer_class`` and saved into a fresh temporary directory that temporarily
    replaces ``self.tmpdirname`` while the parent implementation runs. The temporary directory is
    removed and ``self.tmpdirname`` is restored after each iteration, whether or not the parent test
    passes.
    """
    tmpdirname_orig = self.tmpdirname
    # Here we want to test the 2 available tokenizers that use 2 different types of models: Unigram and WordLevel.
    for tokenizer_class, pretrained_name, kwargs in self.tokenizers_list:
        # `tokenizers_list` stores classes, so `__name__` is read from the class itself;
        # going through `__class__` would label every subtest "type".
        with self.subTest(f"{tokenizer_class.__name__} ({pretrained_name})"):
            # The directory is created *before* entering `try`: if `mkdtemp` fails, the cleanup below
            # must not run, otherwise it would delete the original (shared) `self.tmpdirname`.
            tmpdirname = tempfile.mkdtemp()
            self.tmpdirname = tmpdirname
            try:
                tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # NOTE(review): the parent test presumably reloads the tokenizer from `self.tmpdirname`;
                # confirm against TokenizerTesterMixin.
                tokenizer.save_pretrained(tmpdirname)
                super().test_training_new_tokenizer()
            finally:
                # Even if the test fails, we must be sure that the folder is deleted and that the
                # default tokenizer directory is restored.
                shutil.rmtree(tmpdirname)
                self.tmpdirname = tmpdirname_orig
|
||||||
|
|
||||||
|
def test_training_new_tokenizer_with_special_tokens_change(self):
    """Run the inherited ``test_training_new_tokenizer_with_special_tokens_change`` for each tokenizer.

    For every ``(tokenizer_class, pretrained_name, kwargs)`` entry of ``self.tokenizers_list``, the
    pretrained tokenizer is loaded with ``self.rust_tokenizer_class`` and saved into a fresh temporary
    directory that temporarily replaces ``self.tmpdirname`` while the parent implementation runs. The
    temporary directory is removed and ``self.tmpdirname`` is restored after each iteration, whether
    or not the parent test passes.
    """
    tmpdirname_orig = self.tmpdirname
    # Here we want to test the 2 available tokenizers that use 2 different types of models: Unigram and WordLevel.
    for tokenizer_class, pretrained_name, kwargs in self.tokenizers_list:
        # `tokenizers_list` stores classes, so `__name__` is read from the class itself;
        # going through `__class__` would label every subtest "type".
        with self.subTest(f"{tokenizer_class.__name__} ({pretrained_name})"):
            # The directory is created *before* entering `try`: if `mkdtemp` fails, the cleanup below
            # must not run, otherwise it would delete the original (shared) `self.tmpdirname`.
            tmpdirname = tempfile.mkdtemp()
            self.tmpdirname = tmpdirname
            try:
                tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # NOTE(review): the parent test presumably reloads the tokenizer from `self.tmpdirname`;
                # confirm against TokenizerTesterMixin.
                tokenizer.save_pretrained(tmpdirname)
                super().test_training_new_tokenizer_with_special_tokens_change()
            finally:
                # Even if the test fails, we must be sure that the folder is deleted and that the
                # default tokenizer directory is restored.
                shutil.rmtree(tmpdirname)
                self.tmpdirname = tmpdirname_orig
|
||||||
|
|||||||
Reference in New Issue
Block a user