[fsmt test] basic config test with online model + super tiny model (#7860)

* basic config test with online model

* typo

* style

* better test
This commit is contained in:
Stas Bekman
2020-10-22 06:14:54 -07:00
committed by GitHub
parent 3479787edc
commit 64b4d25cf3
3 changed files with 105 additions and 5 deletions

View File

@@ -25,6 +25,10 @@ from transformers.tokenization_fsmt import VOCAB_FILES_NAMES, FSMTTokenizer
from .test_tokenization_common import TokenizerTesterMixin
# Deliberately use a different tiny model than the one used for the default params
# defined in init, to ensure proper testing.
FSMT_TINY2 = "stas/tiny-wmt19-en-ru"
class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FSMTTokenizer
@@ -86,6 +90,15 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def tokenizer_en_ru(self):
    """Return the FSMT tokenizer loaded from the ``facebook/wmt19-en-ru`` checkpoint."""
    checkpoint = "facebook/wmt19-en-ru"
    return FSMTTokenizer.from_pretrained(checkpoint)
def test_online_tokenizer_config(self):
    """Check that the online tiny tokenizer is fetched and loaded correctly.

    Loads ``FSMT_TINY2`` through ``from_pretrained`` and verifies the language
    pair and both vocabulary sizes. As noted by the original author, the files
    are loaded via the model's tokenizer_config.json, and the test is not slow,
    so it is run by normal CI.
    """
    tiny_tokenizer = FSMTTokenizer.from_pretrained(FSMT_TINY2)

    # Language pair is compared as one list: source first, then target.
    language_pair = [tiny_tokenizer.src_lang, tiny_tokenizer.tgt_lang]
    self.assertListEqual(language_pair, ["en", "ru"])

    # Both vocabularies of the tiny model are expected to hold 21 entries.
    self.assertEqual(tiny_tokenizer.src_vocab_size, 21)
    self.assertEqual(tiny_tokenizer.tgt_vocab_size, 21)
def test_full_tokenizer(self):
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file)