Copy tokenizer files in each of their repo (#10624)

* Move tokenizer files in each repo * Fix mBART50 tests * Fix mBART tests * Fix Marian tests * Update templates
2021-03-10 11:26:23 -05:00
parent d26b37e744
commit 2295d783d5
38 changed files with 378 additions and 317 deletions
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
@@ -65,6 +65,8 @@ from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.ca

 logger = logging.get_logger(__name__)

+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
 PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
@@ -93,6 +95,7 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
    parameters.
    """

+    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
@@ -109,9 +112,16 @@ from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.ca

 logger = logging.get_logger(__name__)

-VOCAB_FILES_NAMES = {}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

-PRETRAINED_VOCAB_FILES_MAP = {}
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
+    },
+    "tokenizer_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json",
+    },
+}

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "{{cookiecutter.checkpoint_identifier}}": 1024,
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
@@ -62,6 +62,8 @@ from ..bart.tokenization_bart import BartTokenizer

 logger = logging.get_logger(__name__)

+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
 PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
@@ -90,6 +92,7 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer):
    parameters.
    """

+    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

@@ -105,9 +108,13 @@ from ...utils import logging

 logger = logging.get_logger(__name__)

-VOCAB_FILES_NAMES = {}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

-PRETRAINED_VOCAB_FILES_MAP = {}
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
+    },
+}

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "{{cookiecutter.checkpoint_identifier}}": 1024,