Remote code improvements (#23959)

* Fix model load when it has both code on the Hub and locally

* Add input check with timeout

* Add tests

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>

* Some non-saved stuff

* Add feature extractors

* Add image processor

* Add model

* Add processor and tokenizer

* Reduce timeout

---------

Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
Sylvain Gugger
2023-06-06 14:31:14 -04:00
committed by GitHub
parent 60825f2c6e
commit f1660d7e23
13 changed files with 434 additions and 102 deletions

View File

@@ -21,6 +21,7 @@ import tempfile
import unittest
from pathlib import Path
import transformers
import transformers.models.auto
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
from transformers.models.bert.configuration_bert import BertConfig
@@ -37,6 +38,9 @@ SAMPLE_ROBERTA_CONFIG = get_tests_dir("fixtures/dummy-config.json")
class AutoConfigTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
def test_module_spec(self):
self.assertIsNotNone(transformers.models.auto.__spec__)
self.assertIsNotNone(importlib.util.find_spec("transformers.models.auto"))
@@ -108,6 +112,13 @@ class AutoConfigTest(unittest.TestCase):
_ = AutoConfig.from_pretrained("hf-internal-testing/no-config-test-repo")
def test_from_pretrained_dynamic_config(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False)
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(config.__class__.__name__, "NewModelConfig")
@@ -116,3 +127,25 @@ class AutoConfigTest(unittest.TestCase):
config.save_pretrained(tmp_dir)
reloaded_config = AutoConfig.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertEqual(reloaded_config.__class__.__name__, "NewModelConfig")
def test_from_pretrained_dynamic_config_conflict(self):
class NewModelConfigLocal(BertConfig):
model_type = "new-model"
try:
AutoConfig.register("new-model", NewModelConfigLocal)
# If remote code is not set, the default is to use local
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model")
self.assertEqual(config.__class__.__name__, "NewModelConfigLocal")
# If remote code is disabled, we load the local one.
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False)
self.assertEqual(config.__class__.__name__, "NewModelConfigLocal")
# If remote is enabled, we load from the Hub
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(config.__class__.__name__, "NewModelConfig")
finally:
if "new-model" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["new-model"]

View File

@@ -19,6 +19,7 @@ import tempfile
import unittest
from pathlib import Path
import transformers
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
@@ -42,6 +43,9 @@ SAMPLE_CONFIG = get_tests_dir("fixtures/dummy-config.json")
class AutoFeatureExtractorTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
def test_feature_extractor_from_model_shortcut(self):
config = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
self.assertIsInstance(config, Wav2Vec2FeatureExtractor)
@@ -96,6 +100,17 @@ class AutoFeatureExtractorTest(unittest.TestCase):
_ = AutoFeatureExtractor.from_pretrained("hf-internal-testing/config-no-model")
def test_from_pretrained_dynamic_feature_extractor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor"
)
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=False
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True
)
@@ -127,3 +142,37 @@ class AutoFeatureExtractorTest(unittest.TestCase):
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_feature_extractor_conflict(self):
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
is_local = True
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
# If remote code is not set, the default is to use local
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor"
)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
self.assertTrue(feature_extractor.is_local)
# If remote code is disabled, we load the local one.
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=False
)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
self.assertTrue(feature_extractor.is_local)
# If remote is enabled, we load from the Hub
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True
)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
self.assertTrue(not hasattr(feature_extractor, "is_local"))
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]

View File

@@ -19,6 +19,7 @@ import tempfile
import unittest
from pathlib import Path
import transformers
from transformers import (
CONFIG_MAPPING,
IMAGE_PROCESSOR_MAPPING,
@@ -37,6 +38,9 @@ from test_module.custom_image_processing import CustomImageProcessor # noqa E40
class AutoImageProcessorTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
def test_image_processor_from_model_shortcut(self):
config = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
self.assertIsInstance(config, CLIPImageProcessor)
@@ -130,6 +134,15 @@ class AutoImageProcessorTest(unittest.TestCase):
_ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
def test_from_pretrained_dynamic_image_processor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
image_processor = AutoImageProcessor.from_pretrained("hf-internal-testing/test_dynamic_image_processor")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=False
)
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
@@ -171,3 +184,35 @@ class AutoImageProcessorTest(unittest.TestCase):
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_image_processor_conflict(self):
class NewImageProcessor(CLIPImageProcessor):
is_local = True
try:
AutoConfig.register("custom", CustomConfig)
AutoImageProcessor.register(CustomConfig, NewImageProcessor)
# If remote code is not set, the default is to use local
image_processor = AutoImageProcessor.from_pretrained("hf-internal-testing/test_dynamic_image_processor")
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(image_processor.is_local)
# If remote code is disabled, we load the local one.
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=False
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(image_processor.is_local)
# If remote is enabled, we load from the Hub
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(not hasattr(image_processor, "is_local"))
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]

View File

@@ -22,6 +22,7 @@ from pathlib import Path
import pytest
import transformers
from transformers import BertConfig, GPT2Model, is_safetensors_available, is_torch_available
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
from transformers.testing_utils import (
@@ -92,6 +93,9 @@ if is_torch_available():
@require_torch
class AutoModelTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
@slow
def test_model_from_pretrained(self):
for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
@@ -312,6 +316,13 @@ class AutoModelTest(unittest.TestCase):
del MODEL_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_model_distant(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False)
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(model.__class__.__name__, "NewModel")
@@ -416,6 +427,34 @@ class AutoModelTest(unittest.TestCase):
if CustomConfig in mapping._extra_content:
del mapping._extra_content[CustomConfig]
def test_from_pretrained_dynamic_model_conflict(self):
class NewModelConfigLocal(BertConfig):
model_type = "new-model"
class NewModel(BertModel):
config_class = NewModelConfigLocal
try:
AutoConfig.register("new-model", NewModelConfigLocal)
AutoModel.register(NewModelConfigLocal, NewModel)
# If remote code is not set, the default is to use local
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model")
self.assertEqual(model.config.__class__.__name__, "NewModelConfigLocal")
# If remote code is disabled, we load the local one.
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False)
self.assertEqual(model.config.__class__.__name__, "NewModelConfigLocal")
# If remote is enabled, we load from the Hub
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(model.config.__class__.__name__, "NewModelConfig")
finally:
if "new-model" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["new-model"]
if NewModelConfigLocal in MODEL_MAPPING._extra_content:
del MODEL_MAPPING._extra_content[NewModelConfigLocal]
def test_repo_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"

View File

@@ -24,6 +24,7 @@ from shutil import copyfile
from huggingface_hub import HfFolder, Repository, create_repo, delete_repo
from requests.exceptions import HTTPError
import transformers
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
@@ -33,6 +34,8 @@ from transformers import (
AutoFeatureExtractor,
AutoProcessor,
AutoTokenizer,
BertTokenizer,
ProcessorMixin,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
@@ -58,6 +61,9 @@ SAMPLE_PROCESSOR_CONFIG_DIR = get_tests_dir("fixtures")
class AutoFeatureExtractorTest(unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
def test_processor_from_model_shortcut(self):
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
self.assertIsInstance(processor, Wav2Vec2Processor)
@@ -144,6 +150,15 @@ class AutoFeatureExtractorTest(unittest.TestCase):
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_from_pretrained_dynamic_processor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor", trust_remote_code=False
)
processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor", trust_remote_code=True)
self.assertTrue(processor.special_attribute_present)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
@@ -203,6 +218,58 @@ class AutoFeatureExtractorTest(unittest.TestCase):
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_processor_conflict(self):
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
special_attribute_present = False
class NewTokenizer(BertTokenizer):
special_attribute_present = False
class NewProcessor(ProcessorMixin):
feature_extractor_class = "AutoFeatureExtractor"
tokenizer_class = "AutoTokenizer"
special_attribute_present = False
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
AutoProcessor.register(CustomConfig, NewProcessor)
# If remote code is not set, the default is to use local classes.
processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor")
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertFalse(processor.special_attribute_present)
self.assertFalse(processor.feature_extractor.special_attribute_present)
self.assertFalse(processor.tokenizer.special_attribute_present)
# If remote code is disabled, we load the local ones.
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor", trust_remote_code=False
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertFalse(processor.special_attribute_present)
self.assertFalse(processor.feature_extractor.special_attribute_present)
self.assertFalse(processor.tokenizer.special_attribute_present)
# If remote is enabled, we load from the Hub.
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor", trust_remote_code=True
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertTrue(processor.special_attribute_present)
self.assertTrue(processor.feature_extractor.special_attribute_present)
self.assertTrue(processor.tokenizer.special_attribute_present)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_auto_processor_creates_tokenizer(self):
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")

View File

@@ -22,6 +22,7 @@ from pathlib import Path
import pytest
import transformers
from transformers import (
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -65,6 +66,9 @@ if is_tokenizers_available():
class AutoTokenizerTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
@slow
def test_tokenizer_from_pretrained(self):
for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x):
@@ -298,6 +302,15 @@ class AutoTokenizerTest(unittest.TestCase):
del TOKENIZER_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_tokenizer(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True)
self.assertTrue(tokenizer.special_attribute_present)
# Test tokenizer can be reloaded.
@@ -326,6 +339,57 @@ class AutoTokenizerTest(unittest.TestCase):
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")
@require_tokenizers
def test_from_pretrained_dynamic_tokenizer_conflict(self):
class NewTokenizer(BertTokenizer):
special_attribute_present = False
class NewTokenizerFast(BertTokenizerFast):
slow_tokenizer_class = NewTokenizer
special_attribute_present = False
try:
AutoConfig.register("custom", CustomConfig)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
AutoTokenizer.register(CustomConfig, fast_tokenizer_class=NewTokenizerFast)
# If remote code is not set, the default is to use local
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
self.assertFalse(tokenizer.special_attribute_present)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", use_fast=False)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
self.assertFalse(tokenizer.special_attribute_present)
# If remote code is disabled, we load the local one.
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
self.assertFalse(tokenizer.special_attribute_present)
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False, use_fast=False
)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
self.assertFalse(tokenizer.special_attribute_present)
# If remote is enabled, we load from the Hub
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True
)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
self.assertTrue(tokenizer.special_attribute_present)
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
self.assertTrue(tokenizer.special_attribute_present)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True