From e579f855fa7bce20b4bfb0bc8ba783bfa6601d10 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 24 Sep 2021 14:57:49 +0200 Subject: [PATCH] up (#13729) --- tests/test_tokenization_fnet.py | 51 +++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/test_tokenization_fnet.py b/tests/test_tokenization_fnet.py index dda35c9251..7a4e5c1932 100644 --- a/tests/test_tokenization_fnet.py +++ b/tests/test_tokenization_fnet.py @@ -18,6 +18,7 @@ import unittest from transformers import FNetTokenizer, FNetTokenizerFast from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow +from transformers.tokenization_utils import AddedToken from .test_tokenization_common import TokenizerTesterMixin @@ -141,6 +142,56 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.sep_token_id ] + # Overriden Tests - loading the fast tokenizer from slow just takes too long + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + r_output = tokenizer_r.encode("Hey this is a token") + + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + + cr_output = tokenizer_r.encode("Hey this is a token") + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + @slow + def test_special_tokens_initialization_from_slow(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True + ) + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + cr_output = tokenizer_r.encode("Hey this is a token") + + self.assertEqual(p_output, cr_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + # Overriden Tests def test_padding(self, max_length=50): if not self.test_slow_tokenizer: