Copy tokenizer files into each of their repos (#10624)

* Move tokenizer files in each repo
* Fix mBART50 tests
* Fix mBART tests
* Fix Marian tests
* Update templates
2021-03-10 11:26:23 -05:00
parent d26b37e744
commit 2295d783d5
38 changed files with 378 additions and 317 deletions
--- a/tests/test_tokenization_marian.py
+++ b/tests/test_tokenization_marian.py
@@ -26,7 +26,7 @@ from transformers.testing_utils import require_sentencepiece


 if is_sentencepiece_available():
-    from transformers.models.marian.tokenization_marian import save_json, vocab_files_names
+    from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json

 from .test_tokenization_common import TokenizerTesterMixin

@@ -50,11 +50,11 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        save_dir = Path(self.tmpdirname)
-        save_json(vocab_tokens, save_dir / vocab_files_names["vocab"])
-        save_json(mock_tokenizer_config, save_dir / vocab_files_names["tokenizer_config_file"])
-        if not (save_dir / vocab_files_names["source_spm"]).exists():
-            copyfile(SAMPLE_SP, save_dir / vocab_files_names["source_spm"])
-            copyfile(SAMPLE_SP, save_dir / vocab_files_names["target_spm"])
+        save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"])
+        save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"])
+        if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists():
+            copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"])
+            copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"])

        tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname)
        tokenizer.save_pretrained(self.tmpdirname)