From e09e54fd9db11075dd3cc6db6ede52916d3d5e15 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Tue, 24 Nov 2020 09:50:25 -0500 Subject: [PATCH] MT5 should have an autotokenizer (#8743) * MT5 should have an autotokenizer * Different configurations should be able to point to same tokenizers --- src/transformers/models/auto/tokenization_auto.py | 2 ++ tests/test_tokenization_auto.py | 12 ++---------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a7e05d92e4..46e1dd9c0d 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -72,6 +72,7 @@ from .configuration_auto import ( MarianConfig, MBartConfig, MobileBertConfig, + MT5Config, OpenAIGPTConfig, PegasusConfig, ProphetNetConfig, @@ -173,6 +174,7 @@ TOKENIZER_MAPPING = OrderedDict( [ (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)), (T5Config, (T5Tokenizer, T5TokenizerFast)), + (MT5Config, (T5Tokenizer, T5TokenizerFast)), (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)), (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)), (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)), diff --git a/tests/test_tokenization_auto.py b/tests/test_tokenization_auto.py index 45f5635ab1..4fb7ece75c 100644 --- a/tests/test_tokenization_auto.py +++ b/tests/test_tokenization_auto.py @@ -99,21 +99,13 @@ class AutoTokenizerTest(unittest.TestCase): for mapping in mappings: mapping = tuple(mapping.items()) - for index, (child_config, (child_model_py, child_model_fast)) in enumerate(mapping[1:]): - for parent_config, (parent_model_py, parent_model_fast) in mapping[: index + 1]: + for index, (child_config, _) in enumerate(mapping[1:]): + for parent_config, _ in mapping[: index + 1]: with self.subTest( msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__) ): self.assertFalse(issubclass(child_config, parent_config)) - # Check for Slow tokenizer implementation if provided - if child_model_py and parent_model_py: - self.assertFalse(issubclass(child_model_py, parent_model_py)) - - # Check for Fast tokenizer implementation if provided - if child_model_fast and parent_model_fast: - self.assertFalse(issubclass(child_model_fast, parent_model_fast)) - @require_tokenizers def test_from_pretrained_use_fast_toggle(self): self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False), BertTokenizer)