MbartTokenizer: do not hardcode vocab size (#5998)

2020-07-23 15:41:14 -04:00
parent 6e16195510
commit 9827d666eb
2 changed files with 43 additions and 30 deletions
--- a/tests/test_tokenization_mbart.py
+++ b/tests/test_tokenization_mbart.py
@@ -113,10 +113,15 @@ class MBartEnroIntegrationTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name)
+        cls.tokenizer: MBartTokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name)
        cls.pad_token_id = 1
        return cls

+    def check_language_codes(self):  # NOTE(review): no test_ prefix, so test discovery skips it
+        self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ar_AR"], 250001)
+        self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["en_XX"], 250004)  # English is en_XX in mBART
+        self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ro_RO"], 250020)
+
    def test_enro_tokenizer_prepare_translation_batch(self):
        batch = self.tokenizer.prepare_translation_batch(
            self.src_text, tgt_texts=self.tgt_text, max_length=len(self.expected_src_tokens),