[fsmt test] basic config test with online model + super tiny model (#7860)
* basic config test with online model * typo * style * better test
This commit is contained in:
@@ -25,6 +25,10 @@ from transformers.tokenization_fsmt import VOCAB_FILES_NAMES, FSMTTokenizer
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
|
||||
# Use a tiny model different from the one behind the default params defined in __init__, to ensure proper testing
|
||||
FSMT_TINY2 = "stas/tiny-wmt19-en-ru"
|
||||
|
||||
|
||||
class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_class = FSMTTokenizer
|
||||
|
||||
@@ -86,6 +90,15 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def tokenizer_en_ru(self):
|
||||
return FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")
|
||||
|
||||
def test_online_tokenizer_config(self):
    """Check that the online tokenizer files are fetched and loaded correctly.

    The tokenizer is loaded from the hub via its tokenizer_config.json; the
    test is not slow, so it runs as part of the normal CI.
    """
    tiny_tok = FSMTTokenizer.from_pretrained(FSMT_TINY2)
    lang_pair = [tiny_tok.src_lang, tiny_tok.tgt_lang]
    self.assertListEqual(lang_pair, ["en", "ru"])
    # Source vocabulary is checked first, then target.
    self.assertEqual(tiny_tok.src_vocab_size, 21)
    self.assertEqual(tiny_tok.tgt_vocab_size, 21)
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
|
||||
tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file)
|
||||
|
||||
Reference in New Issue
Block a user