[Tests] Speed up tokenizer tests (#14964)
* speed up canine and mluke * speed up mbart and mbart50 toks * upload files
This commit is contained in:
committed by
GitHub
parent
f80775df2b
commit
1bfa347707
@@ -39,11 +39,12 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
@cached_property
def canine_tokenizer(self):
    """Return the CANINE tokenizer loaded from the ``google/canine-s`` checkpoint.

    The stale ``nielsr/canine-s`` return (and its TODO asking for exactly this
    replacement) is dropped: it preceded this return and made it unreachable.
    """
    return CanineTokenizer.from_pretrained("google/canine-s")
|
||||
|
||||
def get_tokenizer(self, **kwargs) -> CanineTokenizer:
    """Load the tokenizer saved in ``self.tmpdirname`` and shrink its unicode vocab size.

    Args:
        **kwargs: forwarded unchanged to ``self.tokenizer_class.from_pretrained``.

    Returns:
        The loaded tokenizer with ``_unicode_vocab_size`` set to 1024.
    """
    tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
    # Previously an early ``return`` sat above this line, so the override never ran.
    # NOTE(review): the commit title suggests the smaller value is there to speed up the tests - confirm.
    tokenizer._unicode_vocab_size = 1024
    return tokenizer
|
||||
|
||||
@require_torch
|
||||
def test_prepare_batch_integration(self):
|
||||
|
||||
@@ -1642,6 +1642,78 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(len(tokens[key].shape), 3)
|
||||
self.assertEqual(tokens[key].shape[-1], 4)
|
||||
|
||||
# overwrite from test_tokenization_common to speed up test
|
||||
def test_save_pretrained(self):
|
||||
if not self.test_slow_tokenizer:
|
||||
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
|
||||
return
|
||||
|
||||
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {})
|
||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files + the tokenizer.json file for the fast one
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
|
||||
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=True
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=False
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it saved the tokenizer.json file
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
@unittest.skip("TO DO: overwrite this very extensive test.")
|
||||
def test_alignement_methods(self):
|
||||
pass
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
@@ -122,6 +123,78 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
],
|
||||
)
|
||||
|
||||
# overwrite from test_tokenization_common to speed up test
|
||||
def test_save_pretrained(self):
|
||||
if not self.test_slow_tokenizer:
|
||||
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
|
||||
return
|
||||
|
||||
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {})
|
||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files + the tokenizer.json file for the fast one
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
|
||||
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=True
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=False
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it saved the tokenizer.json file
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_sentencepiece
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
@@ -34,7 +35,7 @@ RO_CODE = 250020
|
||||
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_class = MBart50Tokenizer
|
||||
rust_tokenizer_class = MBart50TokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
@@ -113,11 +114,83 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
revision="d3913889c59cd5c9e456b269c376325eabad57e2",
|
||||
)
|
||||
|
||||
# overwrite from test_tokenization_common to speed up test
|
||||
def test_save_pretrained(self):
|
||||
if not self.test_slow_tokenizer:
|
||||
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
|
||||
return
|
||||
|
||||
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {})
|
||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files + the tokenizer.json file for the fast one
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
|
||||
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=True
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=False
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it saved the tokenizer.json file
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
class MBartOneToManyIntegrationTest(unittest.TestCase):
|
||||
class MBart50OneToManyIntegrationTest(unittest.TestCase):
|
||||
checkpoint_name = "facebook/mbart-large-50-one-to-many-mmt"
|
||||
src_text = [
|
||||
" UN Chief Says There Is No Military Solution in Syria",
|
||||
|
||||
@@ -70,9 +70,8 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
[35378, 8999, 38, 33273, 11676, 604, 365, 21392, 201, 1819],
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = self.tokenizer_class.from_pretrained("studio-ousia/mluke-base")
|
||||
tokenizer = self.tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-mluke")
|
||||
|
||||
text = tokenizer.encode("sequence builders", add_special_tokens=False)
|
||||
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
|
||||
|
||||
@@ -140,6 +140,78 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
],
|
||||
)
|
||||
|
||||
# overwrite from test_tokenization_common to speed up test
|
||||
def test_save_pretrained(self):
|
||||
if not self.test_slow_tokenizer:
|
||||
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
|
||||
return
|
||||
|
||||
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {})
|
||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files + the tokenizer.json file for the fast one
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
|
||||
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=True
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=False
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it saved the tokenizer.json file
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
@cached_property
def big_tokenizer(self):
    """Return the slow XLM-RoBERTa tokenizer loaded from the ``xlm-roberta-base`` checkpoint."""
    checkpoint = "xlm-roberta-base"
    return XLMRobertaTokenizer.from_pretrained(checkpoint)
|
||||
|
||||
Reference in New Issue
Block a user