Expand dynamic supported objects to configs and tokenizers (#14296)
* Dynamic configs * Add config test * Better tests * Add tokenizer and test * Add to from_config * With save
This commit is contained in:
@@ -19,9 +19,9 @@ import os
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from huggingface_hub import delete_repo, login
|
||||
from huggingface_hub import Repository, delete_repo, login
|
||||
from requests.exceptions import HTTPError
|
||||
from transformers import BertConfig, GPT2Config, is_torch_available
|
||||
from transformers import AutoConfig, BertConfig, GPT2Config, is_torch_available
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.testing_utils import PASS, USER, is_staging_test
|
||||
|
||||
@@ -190,6 +190,23 @@ class ConfigTester(object):
|
||||
self.check_config_arguments_init()
|
||||
|
||||
|
||||
class FakeConfig(PretrainedConfig):
    """Minimal custom configuration carrying a single extra field, `attribute`.

    The same class is duplicated as source text in `FAKE_CONFIG_CODE`; keep both in sync.
    """

    def __init__(self, attribute=1, **kwargs):
        # Store the custom field, then hand every remaining keyword to the base class.
        self.attribute = attribute
        super().__init__(**kwargs)
|
||||
|
||||
|
||||
# Make sure this is synchronized with the config above.
|
||||
FAKE_CONFIG_CODE = """
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
class FakeConfig(PretrainedConfig):
|
||||
def __init__(self, attribute=1, **kwargs):
|
||||
self.attribute = attribute
|
||||
super().__init__(**kwargs)
|
||||
"""
|
||||
|
||||
|
||||
@is_staging_test
|
||||
class ConfigPushToHubTester(unittest.TestCase):
|
||||
@classmethod
|
||||
@@ -208,6 +225,11 @@ class ConfigPushToHubTester(unittest.TestCase):
|
||||
except HTTPError:
|
||||
pass
|
||||
|
||||
try:
|
||||
delete_repo(token=cls._token, name="test-dynamic-config")
|
||||
except HTTPError:
|
||||
pass
|
||||
|
||||
def test_push_to_hub(self):
|
||||
config = BertConfig(
|
||||
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
|
||||
@@ -238,6 +260,23 @@ class ConfigPushToHubTester(unittest.TestCase):
|
||||
if k != "transformers_version":
|
||||
self.assertEqual(v, getattr(new_config, k))
|
||||
|
||||
def test_push_to_hub_dynamic_config(self):
    """Push a config whose class lives in repo-side code, then reload it via `AutoConfig`."""
    repo_id = f"{USER}/test-dynamic-config"

    fake_config = FakeConfig(attribute=42)
    # Point the auto class at `<module>.<class>`; the module file is written into the repo below.
    fake_config.auto_map = {"AutoConfig": "configuration.FakeConfig"}

    with tempfile.TemporaryDirectory() as workdir:
        hub_repo = Repository(workdir, clone_from=repo_id, use_auth_token=self._token)
        fake_config.save_pretrained(workdir)

        code_path = os.path.join(workdir, "configuration.py")
        with open(code_path, "w") as code_file:
            code_file.write(FAKE_CONFIG_CODE)

        hub_repo.push_to_hub()

    reloaded = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    # Can't make an isinstance check because the reloaded config is from the FakeConfig class of a dynamic module
    self.assertEqual(reloaded.__class__.__name__, "FakeConfig")
    self.assertEqual(reloaded.attribute, 42)
|
||||
|
||||
|
||||
class ConfigTestUtils(unittest.TestCase):
|
||||
def test_config_from_string(self):
|
||||
|
||||
@@ -30,7 +30,14 @@ import numpy as np
|
||||
import transformers
|
||||
from huggingface_hub import Repository, delete_repo, login
|
||||
from requests.exceptions import HTTPError
|
||||
from transformers import AutoModel, AutoModelForSequenceClassification, is_torch_available, logging
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModel,
|
||||
AutoModelForSequenceClassification,
|
||||
PretrainedConfig,
|
||||
is_torch_available,
|
||||
logging,
|
||||
)
|
||||
from transformers.file_utils import WEIGHTS_NAME, is_flax_available, is_torch_fx_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import (
|
||||
@@ -67,7 +74,6 @@ if is_torch_available():
|
||||
AdaptiveEmbedding,
|
||||
BertConfig,
|
||||
BertModel,
|
||||
PretrainedConfig,
|
||||
PreTrainedModel,
|
||||
T5Config,
|
||||
T5ForConditionalGeneration,
|
||||
@@ -2078,6 +2084,23 @@ class ModelUtilsTest(TestCasePlus):
|
||||
self.assertEqual(model.dtype, torch.float16)
|
||||
|
||||
|
||||
class FakeConfig(PretrainedConfig):
    """Minimal custom configuration carrying a single extra field, `attribute`.

    The same class is duplicated as source text in `FAKE_CONFIG_CODE`; keep both in sync.
    """

    def __init__(self, attribute=1, **kwargs):
        # Store the custom field, then hand every remaining keyword to the base class.
        self.attribute = attribute
        super().__init__(**kwargs)
|
||||
|
||||
|
||||
# Make sure this is synchronized with the config above.
|
||||
FAKE_CONFIG_CODE = """
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
class FakeConfig(PretrainedConfig):
|
||||
def __init__(self, attribute=1, **kwargs):
|
||||
self.attribute = attribute
|
||||
super().__init__(**kwargs)
|
||||
"""
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
|
||||
class FakeModel(PreTrainedModel):
|
||||
@@ -2140,6 +2163,11 @@ class ModelPushToHubTester(unittest.TestCase):
|
||||
except HTTPError:
|
||||
pass
|
||||
|
||||
try:
|
||||
delete_repo(token=cls._token, name="test-dynamic-model-config")
|
||||
except HTTPError:
|
||||
pass
|
||||
|
||||
def test_push_to_hub(self):
|
||||
config = BertConfig(
|
||||
vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
|
||||
@@ -2185,5 +2213,47 @@ class ModelPushToHubTester(unittest.TestCase):
|
||||
repo.push_to_hub()
|
||||
|
||||
new_model = AutoModel.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True)
|
||||
# Can't make an isinstance check because the new_model is from the FakeModel class of a dynamic module
|
||||
self.assertEqual(new_model.__class__.__name__, "FakeModel")
|
||||
for p1, p2 in zip(model.parameters(), new_model.parameters()):
|
||||
self.assertTrue(torch.equal(p1, p2))
|
||||
|
||||
config = AutoConfig.from_pretrained(f"{USER}/test-dynamic-model")
|
||||
new_model = AutoModel.from_config(config, trust_remote_code=True)
|
||||
self.assertEqual(new_model.__class__.__name__, "FakeModel")
|
||||
|
||||
def test_push_to_hub_dynamic_model_and_config(self):
    """Push a model whose config AND model classes both live in repo-side code, then reload them.

    Everything is checked against the repo this test pushes itself, so the test does not
    depend on another test having populated a different repo first.
    """
    repo_id = f"{USER}/test-dynamic-model-config"

    config = FakeConfig(
        attribute=42,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
    )
    # Each auto class is mapped to `<module>.<class>`; both module files are written into the repo below.
    config.auto_map = {"AutoConfig": "configuration.FakeConfig", "AutoModel": "modeling.FakeModel"}
    model = FakeModel(config)

    with tempfile.TemporaryDirectory() as tmp_dir:
        repo = Repository(tmp_dir, clone_from=repo_id, use_auth_token=self._token)
        model.save_pretrained(tmp_dir)
        with open(os.path.join(tmp_dir, "configuration.py"), "w") as f:
            f.write(FAKE_CONFIG_CODE)
        with open(os.path.join(tmp_dir, "modeling.py"), "w") as f:
            f.write(FAKE_MODEL_CODE)

        repo.push_to_hub()

    new_model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
    # Can't make an isinstance check because the new_model.config is from the FakeConfig class of a dynamic module
    self.assertEqual(new_model.config.__class__.__name__, "FakeConfig")
    self.assertEqual(new_model.config.attribute, 42)

    # Can't make an isinstance check because the new_model is from the FakeModel class of a dynamic module
    self.assertEqual(new_model.__class__.__name__, "FakeModel")
    for p1, p2 in zip(model.parameters(), new_model.parameters()):
        self.assertTrue(torch.equal(p1, p2))

    # Reload the config from the repo pushed above (it was previously read from
    # "test-dynamic-model", which belongs to a different test). The config class is
    # dynamic here, so loading it needs `trust_remote_code=True` as well.
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    self.assertEqual(config.__class__.__name__, "FakeConfig")
    new_model = AutoModel.from_config(config, trust_remote_code=True)
    self.assertEqual(new_model.__class__.__name__, "FakeModel")
|
||||
|
||||
@@ -27,11 +27,12 @@ from collections import OrderedDict
|
||||
from itertools import takewhile
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
|
||||
|
||||
from huggingface_hub import delete_repo, login
|
||||
from huggingface_hub import Repository, delete_repo, login
|
||||
from requests.exceptions import HTTPError
|
||||
from transformers import (
|
||||
AlbertTokenizer,
|
||||
AlbertTokenizerFast,
|
||||
AutoTokenizer,
|
||||
BertTokenizer,
|
||||
BertTokenizerFast,
|
||||
PreTrainedTokenizer,
|
||||
@@ -41,6 +42,7 @@ from transformers import (
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
is_tf_available,
|
||||
is_tokenizers_available,
|
||||
is_torch_available,
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
@@ -3513,6 +3515,28 @@ class TokenizerTesterMixin:
|
||||
self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint")))
|
||||
|
||||
|
||||
class FakeTokenizer(BertTokenizer):
    """Slow tokenizer subclass with no changes; mirrored as source text in `FAKE_TOKENIZER_CODE`."""
|
||||
|
||||
|
||||
if is_tokenizers_available():

    class FakeTokenizerFast(BertTokenizerFast):
        """Fast tokenizer subclass with no changes; mirrored as source text in `FAKE_TOKENIZER_CODE`."""
|
||||
|
||||
|
||||
# Make sure this is synchronized with the tokenizers above.
|
||||
FAKE_TOKENIZER_CODE = """
|
||||
from transformers import BertTokenizer, BertTokenizerFast
|
||||
|
||||
class FakeTokenizer(BertTokenizer):
|
||||
pass
|
||||
|
||||
class FakeTokenizerFast(BertTokenizerFast):
|
||||
pass
|
||||
"""
|
||||
|
||||
|
||||
@is_staging_test
|
||||
class TokenizerPushToHubTester(unittest.TestCase):
|
||||
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
|
||||
@@ -3533,6 +3557,11 @@ class TokenizerPushToHubTester(unittest.TestCase):
|
||||
except HTTPError:
|
||||
pass
|
||||
|
||||
try:
|
||||
delete_repo(token=cls._token, name="test-dynamic-tokenizer")
|
||||
except HTTPError:
|
||||
pass
|
||||
|
||||
def test_push_to_hub(self):
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
vocab_file = os.path.join(tmp_dir, "vocab.txt")
|
||||
@@ -3562,6 +3591,48 @@ class TokenizerPushToHubTester(unittest.TestCase):
|
||||
new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org")
|
||||
self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)
|
||||
|
||||
def test_push_to_hub_dynamic_tokenizer(self):
    """Push a tokenizer whose classes live in repo-side code and reload it via `AutoTokenizer`.

    Two rounds are pushed to the same repo: first with only a slow custom class registered,
    then with both a slow and a fast custom class.
    """
    repo_id = f"{USER}/test-dynamic-tokenizer"

    with tempfile.TemporaryDirectory() as tmp_dir:
        vocab_file = os.path.join(tmp_dir, "vocab.txt")
        with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
        tokenizer = FakeTokenizer(vocab_file)

    # No fast custom tokenizer
    tokenizer._auto_map = ("tokenizer.FakeTokenizer", None)
    with tempfile.TemporaryDirectory() as tmp_dir:
        repo = Repository(tmp_dir, clone_from=repo_id, use_auth_token=self._token)
        tokenizer.save_pretrained(tmp_dir)
        with open(os.path.join(tmp_dir, "tokenizer.py"), "w") as f:
            f.write(FAKE_TOKENIZER_CODE)

        repo.push_to_hub()

    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    # Can't make an isinstance check because the tokenizer is from the FakeTokenizer class of a dynamic module
    self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizer")

    # Fast and slow custom tokenizer
    tokenizer._auto_map = ("tokenizer.FakeTokenizer", "tokenizer.FakeTokenizerFast")
    with tempfile.TemporaryDirectory() as tmp_dir:
        repo = Repository(tmp_dir, clone_from=repo_id, use_auth_token=self._token)
        tokenizer.save_pretrained(tmp_dir)
        with open(os.path.join(tmp_dir, "tokenizer.py"), "w") as f:
            f.write(FAKE_TOKENIZER_CODE)

        repo.push_to_hub()

    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    # Can't make an isinstance check because the tokenizer is from the FakeTokenizerFast class of a dynamic module
    self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizerFast")

    tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=False, trust_remote_code=True)
    # Can't make an isinstance check because the tokenizer is from the FakeTokenizer class of a dynamic module
    self.assertEqual(tokenizer.__class__.__name__, "FakeTokenizer")
|
||||
|
||||
|
||||
class TrieTest(unittest.TestCase):
|
||||
def test_trie(self):
|
||||
|
||||
Reference in New Issue
Block a user