Use lru_cache for tokenization tests (#36818)
* fix
* fix
* fix
* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -53,12 +53,13 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
test_sentencepiece = True
|
||||
from_pretrained_kwargs = {}
|
||||
|
||||
- def setUp(self):
|
||||
- super().setUp()
|
||||
+ @classmethod
|
||||
+ def setUpClass(cls):
|
||||
+ super().setUpClass()
|
||||
# We have a SentencePiece fixture for testing
|
||||
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
- tokenizer.save_pretrained(self.tmpdirname)
|
||||
+ tokenizer.save_pretrained(cls.tmpdirname)
|
||||
|
||||
@require_torch
|
||||
def test_batch_tokenization(self):
|
||||
@@ -103,7 +104,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||
added_tokens = [AddedToken("<special>", lstrip=True)]
|
||||
|
||||
- tokenizer_r = self.rust_tokenizer_class.from_pretrained(
|
||||
+ tokenizer_r = self.get_rust_tokenizer(
|
||||
pretrained_name, additional_special_tokens=added_tokens, **kwargs
|
||||
)
|
||||
r_output = tokenizer_r.encode("Hey this is a <special> token")
|
||||
@@ -113,7 +114,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_token_id in r_output)
|
||||
|
||||
if self.test_slow_tokenizer:
|
||||
- tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
|
||||
+ tokenizer_cr = self.get_rust_tokenizer(
|
||||
pretrained_name,
|
||||
additional_special_tokens=added_tokens,
|
||||
**kwargs, # , from_slow=True <- unfortunately too slow to convert
|
||||
|
||||
Reference in New Issue
Block a user