Save code of registered custom models (#15379)
* Allow dynamic modules to use relative imports * Work for configs * Fix last merge conflict * Save code of registered custom objects * Map strings to strings * Fix test * Add tokenizer * Rework tests * Tests * Ignore fixtures py files for tests * Tokenizer test + fix collection * With full path * Rework integration * Fix typo * Remove changes in conftest * Test for tokenizers * Add documentation * Update docs/source/custom_models.mdx Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Add file structure and file content * Add more doc * Style * Update docs/source/custom_models.mdx Co-authored-by: Suraj Patil <surajp815@gmail.com> * Address review comments Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Suraj Patil <surajp815@gmail.com>
This commit is contained in:
@@ -21,10 +21,12 @@ import os
|
||||
import pickle
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from collections import OrderedDict
|
||||
from itertools import takewhile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
|
||||
|
||||
from huggingface_hub import Repository, delete_repo, login
|
||||
@@ -67,6 +69,15 @@ if TYPE_CHECKING:
|
||||
from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
|
||||
|
||||
|
||||
sys.path.append(str(Path(__file__).parent.parent / "utils"))
|
||||
|
||||
from test_module.custom_tokenization import CustomTokenizer # noqa E402
|
||||
|
||||
|
||||
if is_tokenizers_available():
|
||||
from test_module.custom_tokenization_fast import CustomTokenizerFast
|
||||
|
||||
|
||||
NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
|
||||
|
||||
SMALL_TRAINING_CORPUS = [
|
||||
@@ -3690,28 +3701,6 @@ class TokenizerTesterMixin:
|
||||
self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
|
||||
|
||||
|
||||
class FakeTokenizer(BertTokenizer):
|
||||
pass
|
||||
|
||||
|
||||
if is_tokenizers_available():
|
||||
|
||||
class FakeTokenizerFast(BertTokenizerFast):
|
||||
pass
|
||||
|
||||
|
||||
# Make sure this is synchronized with the tokenizers above.
|
||||
FAKE_TOKENIZER_CODE = """
|
||||
from transformers import BertTokenizer, BertTokenizerFast
|
||||
|
||||
class FakeTokenizer(BertTokenizer):
|
||||
pass
|
||||
|
||||
class FakeTokenizerFast(BertTokenizerFast):
|
||||
pass
|
||||
"""
|
||||
|
||||
|
||||
@is_staging_test
|
||||
class TokenizerPushToHubTester(unittest.TestCase):
|
||||
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
|
||||
@@ -3766,47 +3755,62 @@ class TokenizerPushToHubTester(unittest.TestCase):
|
||||
new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org")
|
||||
self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
|
||||
|
||||
@require_tokenizers
|
||||
def test_push_to_hub_dynamic_tokenizer(self):
|
||||
CustomTokenizer.register_for_auto_class()
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
vocab_file = os.path.join(tmp_dir, "vocab.txt")
|
||||
with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
|
||||
tokenizer = FakeTokenizer(vocab_file)
|
||||
tokenizer = CustomTokenizer(vocab_file)
|
||||
|
||||
# No fast custom tokenizer
|
||||
tokenizer._auto_map = ("tokenizer.FakeTokenizer", None)
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-tokenizer", use_auth_token=self._token)
|
||||
print(os.listdir((tmp_dir)))
|
||||
tokenizer.save_pretrained(tmp_dir)
|
||||
with open(os.path.join(tmp_dir, "tokenizer.py"), "w") as f:
|
||||
f.write(FAKE_TOKENIZER_CODE)
|
||||
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f:
|
||||
tokenizer_config = json.load(f)
|
||||
self.assertEqual(tokenizer_config["auto_map"], ["custom_tokenization.CustomTokenizer", None])
|
||||
|
||||
repo.push_to_hub()
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
|
||||
# Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
|
||||
self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizer")
|
||||
# Can't make an isinstance check because the new_model.config is from the CustomTokenizer class of a dynamic module
|
||||
self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")
|
||||
|
||||
# Fast and slow custom tokenizer
|
||||
tokenizer._auto_map = ("tokenizer.FakeTokenizer", "tokenizer.FakeTokenizerFast")
|
||||
CustomTokenizerFast.register_for_auto_class()
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
vocab_file = os.path.join(tmp_dir, "vocab.txt")
|
||||
with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
|
||||
|
||||
bert_tokenizer = BertTokenizerFast.from_pretrained(tmp_dir)
|
||||
bert_tokenizer.save_pretrained(tmp_dir)
|
||||
tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-tokenizer", use_auth_token=self._token)
|
||||
print(os.listdir((tmp_dir)))
|
||||
tokenizer.save_pretrained(tmp_dir)
|
||||
with open(os.path.join(tmp_dir, "tokenizer.py"), "w") as f:
|
||||
f.write(FAKE_TOKENIZER_CODE)
|
||||
|
||||
with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f:
|
||||
tokenizer_config = json.load(f)
|
||||
self.assertEqual(
|
||||
tokenizer_config["auto_map"],
|
||||
["custom_tokenization.CustomTokenizer", "custom_tokenization_fast.CustomTokenizerFast"],
|
||||
)
|
||||
|
||||
repo.push_to_hub()
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
|
||||
# Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
|
||||
self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizerFast")
|
||||
self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizerFast")
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
f"{USER}/test-dynamic-tokenizer", use_fast=False, trust_remote_code=True
|
||||
)
|
||||
# Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
|
||||
self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizer")
|
||||
self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")
|
||||
|
||||
|
||||
class TrieTest(unittest.TestCase):
|
||||
|
||||
Reference in New Issue
Block a user