Save code of registered custom models (#15379)

* Allow dynamic modules to use relative imports * Work for configs * Fix last merge conflict * Save code of registered custom objects * Map strings to strings * Fix test * Add tokenizer * Rework tests * Tests * Ignore fixtures py files for tests * Tokenizer test + fix collection * With full path * Rework integration * Fix typo * Remove changes in conftest * Test for tokenizers * Add documentation * Update docs/source/custom_models.mdx Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Add file structure and file content * Add more doc * Style * Update docs/source/custom_models.mdx Co-authored-by: Suraj Patil <surajp815@gmail.com> * Address review comments Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Suraj Patil <surajp815@gmail.com>
2022-02-02 10:44:37 -05:00
parent 623d8cb475
commit 44b21f117b
23 changed files with 630 additions and 295 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -21,10 +21,12 @@ import os
 import pickle
 import re
 import shutil
+import sys
 import tempfile
 import unittest
 from collections import OrderedDict
 from itertools import takewhile
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union

 from huggingface_hub import Repository, delete_repo, login
@@ -67,6 +69,15 @@ if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel


+sys.path.append(str(Path(__file__).parent.parent / "utils"))
+
+from test_module.custom_tokenization import CustomTokenizer  # noqa E402
+
+
+if is_tokenizers_available():
+    from test_module.custom_tokenization_fast import CustomTokenizerFast
+
+
 NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]

 SMALL_TRAINING_CORPUS = [
@@ -3690,28 +3701,6 @@ class TokenizerTesterMixin:
                    self.rust_tokenizer_class.from_pretrained(tmp_dir_2)


-class FakeTokenizer(BertTokenizer):
-    pass
-
-
-if is_tokenizers_available():
-
-    class FakeTokenizerFast(BertTokenizerFast):
-        pass
-
-
-# Make sure this is synchronized with the tokenizers above.
-FAKE_TOKENIZER_CODE = """
-from transformers import BertTokenizer, BertTokenizerFast
-
-class FakeTokenizer(BertTokenizer):
-    pass
-
-class FakeTokenizerFast(BertTokenizerFast):
-    pass
-"""
-
-
@is_staging_test
 class TokenizerPushToHubTester(unittest.TestCase):
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
@@ -3766,47 +3755,62 @@ class TokenizerPushToHubTester(unittest.TestCase):
            new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org")
            self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)

+    @require_tokenizers
    def test_push_to_hub_dynamic_tokenizer(self):
+        CustomTokenizer.register_for_auto_class()
        with tempfile.TemporaryDirectory() as tmp_dir:
            vocab_file = os.path.join(tmp_dir, "vocab.txt")
            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
-            tokenizer = FakeTokenizer(vocab_file)
+            tokenizer = CustomTokenizer(vocab_file)

        # No fast custom tokenizer
-        tokenizer._auto_map = ("tokenizer.FakeTokenizer", None)
        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-tokenizer", use_auth_token=self._token)
-            print(os.listdir((tmp_dir)))
            tokenizer.save_pretrained(tmp_dir)
-            with open(os.path.join(tmp_dir, "tokenizer.py"), "w") as f:
-                f.write(FAKE_TOKENIZER_CODE)
+
+            with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f:
+                tokenizer_config = json.load(f)
+            self.assertEqual(tokenizer_config["auto_map"], ["custom_tokenization.CustomTokenizer", None])

            repo.push_to_hub()

        tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
-        # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
-        self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizer")
+        # Can't make an isinstance check because the new_model.config is from the CustomTokenizer class of a dynamic module
+        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")

        # Fast and slow custom tokenizer
-        tokenizer._auto_map = ("tokenizer.FakeTokenizer", "tokenizer.FakeTokenizerFast")
+        CustomTokenizerFast.register_for_auto_class()
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            vocab_file = os.path.join(tmp_dir, "vocab.txt")
+            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
+
+            bert_tokenizer = BertTokenizerFast.from_pretrained(tmp_dir)
+            bert_tokenizer.save_pretrained(tmp_dir)
+            tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)
+
        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-tokenizer", use_auth_token=self._token)
-            print(os.listdir((tmp_dir)))
            tokenizer.save_pretrained(tmp_dir)
-            with open(os.path.join(tmp_dir, "tokenizer.py"), "w") as f:
-                f.write(FAKE_TOKENIZER_CODE)
+
+            with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f:
+                tokenizer_config = json.load(f)
+            self.assertEqual(
+                tokenizer_config["auto_map"],
+                ["custom_tokenization.CustomTokenizer", "custom_tokenization_fast.CustomTokenizerFast"],
+            )

            repo.push_to_hub()

        tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
        # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
-        self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizerFast")
+        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizerFast")
        tokenizer = AutoTokenizer.from_pretrained(
            f"{USER}/test-dynamic-tokenizer", use_fast=False, trust_remote_code=True
        )
        # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
-        self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizer")
+        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")


 class TrieTest(unittest.TestCase):