From f1660d7e23d4432513fe060bde4f9b7b29f05204 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 6 Jun 2023 14:31:14 -0400 Subject: [PATCH] Remote code improvements (#23959) * Fix model load when it has both code on the Hub and locally * Add input check with timeout * Add tests * Apply suggestions from code review Co-authored-by: Lysandre Debut * Some non-saved stuff * Add feature extractors * Add image processor * Add model * Add processor and tokenizer * Reduce timeout --------- Co-authored-by: Lysandre Debut --- src/transformers/dynamic_module_utils.py | 44 ++++++++++++ src/transformers/models/auto/auto_factory.py | 38 +++++------ .../models/auto/configuration_auto.py | 18 ++--- .../models/auto/feature_extraction_auto.py | 32 ++++----- .../models/auto/image_processing_auto.py | 32 ++++----- .../models/auto/processing_auto.py | 36 +++++----- .../models/auto/tokenization_auto.py | 39 +++++------ tests/models/auto/test_configuration_auto.py | 33 +++++++++ .../auto/test_feature_extraction_auto.py | 49 ++++++++++++++ .../models/auto/test_image_processing_auto.py | 45 +++++++++++++ tests/models/auto/test_modeling_auto.py | 39 +++++++++++ tests/models/auto/test_processor_auto.py | 67 +++++++++++++++++++ tests/models/auto/test_tokenization_auto.py | 64 ++++++++++++++++++ 13 files changed, 434 insertions(+), 102 deletions(-) diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index ae76e4ae1f..f106ccd213 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -18,6 +18,7 @@ import importlib import os import re import shutil +import signal import sys from pathlib import Path from typing import Dict, Optional, Union @@ -513,3 +514,46 @@ def custom_object_save(obj, folder, config=None): result.append(dest_file) return result + + +def _raise_timeout_error(signum, frame): + raise ValueError( + "Loading this model requires you to 
execute the configuration file in that repo on your local machine. We " + "asked if it was okay but did not get an answer. Make sure you have read the code there to avoid malicious " + "use, then set the option `trust_remote_code=True` to remove this error." + ) + + +TIME_OUT_REMOTE_CODE = 15 + + +def resolve_trust_remote_code(trust_remote_code, model_name, has_local_code, has_remote_code): + if trust_remote_code is None: + if has_local_code: + trust_remote_code = False + elif has_remote_code and TIME_OUT_REMOTE_CODE > 0: + signal.signal(signal.SIGALRM, _raise_timeout_error) + signal.alarm(TIME_OUT_REMOTE_CODE) + while trust_remote_code is None: + answer = input( + f"Loading {model_name} requires to execute some code in that repo, you can inspect the content of " + f"the repository at https://hf.co/{model_name}. You can dismiss this prompt by passing " + "`trust_remote_code=True`.\nDo you accept? [y/N] " + ) + if answer.lower() in ["yes", "y", "1"]: + trust_remote_code = True + elif answer.lower() in ["no", "n", "0", ""]: + trust_remote_code = False + signal.alarm(0) + elif has_remote_code: + # For the CI which puts the timeout at 0 + _raise_timeout_error(None, None) + + if has_remote_code and not has_local_code and not trust_remote_code: + raise ValueError( + f"Loading {model_name} requires you to execute the configuration file in that" + " repo on your local machine. Make sure you have read the code there to avoid malicious use, then" + " set the option `trust_remote_code=True` to remove this error." 
+ ) + + return trust_remote_code diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index 8919341166..3c0aa8d63f 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -18,7 +18,7 @@ import importlib from collections import OrderedDict from ...configuration_utils import PretrainedConfig -from ...dynamic_module_utils import get_class_from_dynamic_module +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...utils import copy_func, logging, requires_backends from .configuration_auto import AutoConfig, model_type_to_module_name, replace_list_option_in_docstrings @@ -404,19 +404,14 @@ class _BaseAutoModelClass: @classmethod def from_config(cls, config, **kwargs): - trust_remote_code = kwargs.pop("trust_remote_code", False) - if hasattr(config, "auto_map") and cls.__name__ in config.auto_map: - if not trust_remote_code: - raise ValueError( - "Loading this model requires you to execute the modeling file in that repo " - "on your local machine. Make sure you have read the code there to avoid malicious use, then set " - "the option `trust_remote_code=True` to remove this error." - ) - if kwargs.get("revision", None) is None: - logger.warning( - "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure " - "no malicious code has been contributed in a newer revision." 
- ) + trust_remote_code = kwargs.pop("trust_remote_code", None) + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, config._name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: class_ref = config.auto_map[cls.__name__] if "--" in class_ref: repo_id, class_ref = class_ref.split("--") @@ -437,7 +432,7 @@ class _BaseAutoModelClass: @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): config = kwargs.pop("config", None) - trust_remote_code = kwargs.pop("trust_remote_code", False) + trust_remote_code = kwargs.pop("trust_remote_code", None) kwargs["_from_auto"] = True hub_kwargs_names = [ "cache_dir", @@ -470,13 +465,12 @@ class _BaseAutoModelClass: if kwargs_orig.get("torch_dtype", None) == "auto": kwargs["torch_dtype"] = "auto" - if hasattr(config, "auto_map") and cls.__name__ in config.auto_map: - if not trust_remote_code: - raise ValueError( - f"Loading {pretrained_model_name_or_path} requires you to execute the modeling file in that repo " - "on your local machine. Make sure you have read the code there to avoid malicious use, then set " - "the option `trust_remote_code=True` to remove this error." 
- ) + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + if has_remote_code and trust_remote_code: class_ref = config.auto_map[cls.__name__] model_class = get_class_from_dynamic_module( class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 197d2b2e14..6445f62acb 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -20,7 +20,7 @@ from collections import OrderedDict from typing import List, Union from ...configuration_utils import PretrainedConfig -from ...dynamic_module_utils import get_class_from_dynamic_module +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...utils import CONFIG_NAME, logging @@ -940,15 +940,15 @@ class AutoConfig: ```""" kwargs["_from_auto"] = True kwargs["name_or_path"] = pretrained_model_name_or_path - trust_remote_code = kwargs.pop("trust_remote_code", False) + trust_remote_code = kwargs.pop("trust_remote_code", None) config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) - if "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"]: - if not trust_remote_code: - raise ValueError( - f"Loading {pretrained_model_name_or_path} requires you to execute the configuration file in that" - " repo on your local machine. Make sure you have read the code there to avoid malicious use, then" - " set the option `trust_remote_code=True` to remove this error." 
- ) + has_remote_code = "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"] + has_local_code = "model_type" in config_dict and config_dict["model_type"] in CONFIG_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: class_ref = config_dict["auto_map"]["AutoConfig"] config_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs) _ = kwargs.pop("code_revision", None) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 588f4b4d3d..681a59e0b2 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -21,7 +21,7 @@ from typing import Dict, Optional, Union # Build the list of all feature extractors from ...configuration_utils import PretrainedConfig -from ...dynamic_module_utils import get_class_from_dynamic_module +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...feature_extraction_utils import FeatureExtractionMixin from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, get_file_from_repo, logging from .auto_factory import _LazyAutoMapping @@ -307,7 +307,7 @@ class AutoFeatureExtractor: >>> # feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/") ```""" config = kwargs.pop("config", None) - trust_remote_code = kwargs.pop("trust_remote_code", False) + trust_remote_code = kwargs.pop("trust_remote_code", None) kwargs["_from_auto"] = True config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) @@ -326,21 +326,21 @@ class AutoFeatureExtractor: feature_extractor_auto_map = config.auto_map["AutoFeatureExtractor"] if feature_extractor_class is not None: - # If we have custom code for a feature extractor, we 
get the proper class. - if feature_extractor_auto_map is not None: - if not trust_remote_code: - raise ValueError( - f"Loading {pretrained_model_name_or_path} requires you to execute the feature extractor file " - "in that repo on your local machine. Make sure you have read the code there to avoid " - "malicious use, then set the option `trust_remote_code=True` to remove this error." - ) - feature_extractor_class = get_class_from_dynamic_module( - feature_extractor_auto_map, pretrained_model_name_or_path, **kwargs - ) - _ = kwargs.pop("code_revision", None) - else: - feature_extractor_class = feature_extractor_class_from_name(feature_extractor_class) + feature_extractor_class = feature_extractor_class_from_name(feature_extractor_class) + has_remote_code = feature_extractor_auto_map is not None + has_local_code = feature_extractor_class is not None or type(config) in FEATURE_EXTRACTOR_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: + feature_extractor_class = get_class_from_dynamic_module( + feature_extractor_auto_map, pretrained_model_name_or_path, **kwargs + ) + _ = kwargs.pop("code_revision", None) + return feature_extractor_class.from_dict(config_dict, **kwargs) + elif feature_extractor_class is not None: return feature_extractor_class.from_dict(config_dict, **kwargs) # Last try: we use the FEATURE_EXTRACTOR_MAPPING. 
elif type(config) in FEATURE_EXTRACTOR_MAPPING: diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index cfa11f149c..ab02bd495a 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -21,7 +21,7 @@ from typing import Dict, Optional, Union # Build the list of all image processors from ...configuration_utils import PretrainedConfig -from ...dynamic_module_utils import get_class_from_dynamic_module +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...image_processing_utils import ImageProcessingMixin from ...utils import CONFIG_NAME, IMAGE_PROCESSOR_NAME, get_file_from_repo, logging from .auto_factory import _LazyAutoMapping @@ -314,7 +314,7 @@ class AutoImageProcessor: >>> # image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/") ```""" config = kwargs.pop("config", None) - trust_remote_code = kwargs.pop("trust_remote_code", False) + trust_remote_code = kwargs.pop("trust_remote_code", None) kwargs["_from_auto"] = True config_dict, _ = ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) @@ -351,21 +351,21 @@ class AutoImageProcessor: image_processor_auto_map = config.auto_map["AutoImageProcessor"] if image_processor_class is not None: - # If we have custom code for a image processor, we get the proper class. - if image_processor_auto_map is not None: - if not trust_remote_code: - raise ValueError( - f"Loading {pretrained_model_name_or_path} requires you to execute the image processor file " - "in that repo on your local machine. Make sure you have read the code there to avoid " - "malicious use, then set the option `trust_remote_code=True` to remove this error." 
- ) - image_processor_class = get_class_from_dynamic_module( - image_processor_auto_map, pretrained_model_name_or_path, **kwargs - ) - _ = kwargs.pop("code_revision", None) - else: - image_processor_class = image_processor_class_from_name(image_processor_class) + image_processor_class = image_processor_class_from_name(image_processor_class) + has_remote_code = image_processor_auto_map is not None + has_local_code = image_processor_class is not None or type(config) in IMAGE_PROCESSOR_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: + image_processor_class = get_class_from_dynamic_module( + image_processor_auto_map, pretrained_model_name_or_path, **kwargs + ) + _ = kwargs.pop("code_revision", None) + return image_processor_class.from_dict(config_dict, **kwargs) + elif image_processor_class is not None: return image_processor_class.from_dict(config_dict, **kwargs) # Last try: we use the IMAGE_PROCESSOR_MAPPING. 
elif type(config) in IMAGE_PROCESSOR_MAPPING: diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 691a9bb6d5..2dc8be70fe 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -20,7 +20,7 @@ from collections import OrderedDict # Build the list of all feature extractors from ...configuration_utils import PretrainedConfig -from ...dynamic_module_utils import get_class_from_dynamic_module +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...feature_extraction_utils import FeatureExtractionMixin from ...image_processing_utils import ImageProcessingMixin from ...tokenization_utils import TOKENIZER_CONFIG_FILE @@ -194,7 +194,7 @@ class AutoProcessor: >>> # processor = AutoProcessor.from_pretrained("./test/saved_model/") ```""" config = kwargs.pop("config", None) - trust_remote_code = kwargs.pop("trust_remote_code", False) + trust_remote_code = kwargs.pop("trust_remote_code", None) kwargs["_from_auto"] = True processor_class = None @@ -248,28 +248,28 @@ class AutoProcessor: processor_auto_map = config.auto_map["AutoProcessor"] if processor_class is not None: - # If we have custom code for a feature extractor, we get the proper class. - if processor_auto_map is not None: - if not trust_remote_code: - raise ValueError( - f"Loading {pretrained_model_name_or_path} requires you to execute the feature extractor file " - "in that repo on your local machine. Make sure you have read the code there to avoid " - "malicious use, then set the option `trust_remote_code=True` to remove this error." 
- ) + processor_class = processor_class_from_name(processor_class) - processor_class = get_class_from_dynamic_module( - processor_auto_map, pretrained_model_name_or_path, **kwargs - ) - _ = kwargs.pop("code_revision", None) - else: - processor_class = processor_class_from_name(processor_class) + has_remote_code = processor_auto_map is not None + has_local_code = processor_class is not None or type(config) in PROCESSOR_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + if has_remote_code and trust_remote_code: + processor_class = get_class_from_dynamic_module( + processor_auto_map, pretrained_model_name_or_path, **kwargs + ) + _ = kwargs.pop("code_revision", None) + return processor_class.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + elif processor_class is not None: return processor_class.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs ) - # Last try: we use the PROCESSOR_MAPPING. 
- if type(config) in PROCESSOR_MAPPING: + elif type(config) in PROCESSOR_MAPPING: return PROCESSOR_MAPPING[type(config)].from_pretrained(pretrained_model_name_or_path, **kwargs) # At this stage, there doesn't seem to be a `Processor` class available for this model, so let's try a diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index aa4d5860a1..049310e629 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -21,7 +21,7 @@ from collections import OrderedDict from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from ...configuration_utils import PretrainedConfig -from ...dynamic_module_utils import get_class_from_dynamic_module +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE from ...utils import cached_file, extract_commit_hash, is_sentencepiece_available, is_tokenizers_available, logging @@ -608,7 +608,7 @@ class AutoTokenizer: use_fast = kwargs.pop("use_fast", True) tokenizer_type = kwargs.pop("tokenizer_type", None) - trust_remote_code = kwargs.pop("trust_remote_code", False) + trust_remote_code = kwargs.pop("trust_remote_code", None) # First, let's see whether the tokenizer_type is passed so that we can leverage it if tokenizer_type is not None: @@ -662,31 +662,28 @@ class AutoTokenizer: if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map: tokenizer_auto_map = config.auto_map["AutoTokenizer"] - # If we have the tokenizer class from the tokenizer config or the model config we're good! 
- if config_tokenizer_class is not None: + has_remote_code = tokenizer_auto_map is not None + has_local_code = config_tokenizer_class is not None or type(config) in TOKENIZER_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: + if use_fast and tokenizer_auto_map[1] is not None: + class_ref = tokenizer_auto_map[1] + else: + class_ref = tokenizer_auto_map[0] + tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs) + _ = kwargs.pop("code_revision", None) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif config_tokenizer_class is not None: tokenizer_class = None - if tokenizer_auto_map is not None: - if not trust_remote_code: - raise ValueError( - f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that" - " repo on your local machine. Make sure you have read the code there to avoid malicious use," - " then set the option `trust_remote_code=True` to remove this error." - ) - - if use_fast and tokenizer_auto_map[1] is not None: - class_ref = tokenizer_auto_map[1] - else: - class_ref = tokenizer_auto_map[0] - tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs) - _ = kwargs.pop("code_revision", None) - - elif use_fast and not config_tokenizer_class.endswith("Fast"): + if use_fast and not config_tokenizer_class.endswith("Fast"): tokenizer_class_candidate = f"{config_tokenizer_class}Fast" tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) if tokenizer_class is None: tokenizer_class_candidate = config_tokenizer_class tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) - if tokenizer_class is None: raise ValueError( f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported." 
diff --git a/tests/models/auto/test_configuration_auto.py b/tests/models/auto/test_configuration_auto.py index 030a03aa6d..fa05952d29 100644 --- a/tests/models/auto/test_configuration_auto.py +++ b/tests/models/auto/test_configuration_auto.py @@ -21,6 +21,7 @@ import tempfile import unittest from pathlib import Path +import transformers import transformers.models.auto from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig from transformers.models.bert.configuration_bert import BertConfig @@ -37,6 +38,9 @@ SAMPLE_ROBERTA_CONFIG = get_tests_dir("fixtures/dummy-config.json") class AutoConfigTest(unittest.TestCase): + def setUp(self): + transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0 + def test_module_spec(self): self.assertIsNotNone(transformers.models.auto.__spec__) self.assertIsNotNone(importlib.util.find_spec("transformers.models.auto")) @@ -108,6 +112,13 @@ class AutoConfigTest(unittest.TestCase): _ = AutoConfig.from_pretrained("hf-internal-testing/no-config-test-repo") def test_from_pretrained_dynamic_config(self): + # If remote code is not set, we will time out when asking whether to load the model. + with self.assertRaises(ValueError): + config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model") + # If remote code is disabled, we can't load this config. 
+ with self.assertRaises(ValueError): + config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False) + config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True) self.assertEqual(config.__class__.__name__, "NewModelConfig") @@ -116,3 +127,25 @@ class AutoConfigTest(unittest.TestCase): config.save_pretrained(tmp_dir) reloaded_config = AutoConfig.from_pretrained(tmp_dir, trust_remote_code=True) self.assertEqual(reloaded_config.__class__.__name__, "NewModelConfig") + + def test_from_pretrained_dynamic_config_conflict(self): + class NewModelConfigLocal(BertConfig): + model_type = "new-model" + + try: + AutoConfig.register("new-model", NewModelConfigLocal) + # If remote code is not set, the default is to use local + config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model") + self.assertEqual(config.__class__.__name__, "NewModelConfigLocal") + + # If remote code is disabled, we load the local one. 
+ config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False) + self.assertEqual(config.__class__.__name__, "NewModelConfigLocal") + + # If remote is enabled, we load from the Hub + config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True) + self.assertEqual(config.__class__.__name__, "NewModelConfig") + + finally: + if "new-model" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["new-model"] diff --git a/tests/models/auto/test_feature_extraction_auto.py b/tests/models/auto/test_feature_extraction_auto.py index 35d3ac0fa4..ed50006741 100644 --- a/tests/models/auto/test_feature_extraction_auto.py +++ b/tests/models/auto/test_feature_extraction_auto.py @@ -19,6 +19,7 @@ import tempfile import unittest from pathlib import Path +import transformers from transformers import ( CONFIG_MAPPING, FEATURE_EXTRACTOR_MAPPING, @@ -42,6 +43,9 @@ SAMPLE_CONFIG = get_tests_dir("fixtures/dummy-config.json") class AutoFeatureExtractorTest(unittest.TestCase): + def setUp(self): + transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0 + def test_feature_extractor_from_model_shortcut(self): config = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") self.assertIsInstance(config, Wav2Vec2FeatureExtractor) @@ -96,6 +100,17 @@ class AutoFeatureExtractorTest(unittest.TestCase): _ = AutoFeatureExtractor.from_pretrained("hf-internal-testing/config-no-model") def test_from_pretrained_dynamic_feature_extractor(self): + # If remote code is not set, we will time out when asking whether to load the model. + with self.assertRaises(ValueError): + feature_extractor = AutoFeatureExtractor.from_pretrained( + "hf-internal-testing/test_dynamic_feature_extractor" + ) + # If remote code is disabled, we can't load this config. 
+ with self.assertRaises(ValueError): + feature_extractor = AutoFeatureExtractor.from_pretrained( + "hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=False + ) + feature_extractor = AutoFeatureExtractor.from_pretrained( "hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True ) @@ -127,3 +142,37 @@ class AutoFeatureExtractorTest(unittest.TestCase): del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content: del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig] + + def test_from_pretrained_dynamic_feature_extractor_conflict(self): + class NewFeatureExtractor(Wav2Vec2FeatureExtractor): + is_local = True + + try: + AutoConfig.register("custom", CustomConfig) + AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor) + # If remote code is not set, the default is to use local + feature_extractor = AutoFeatureExtractor.from_pretrained( + "hf-internal-testing/test_dynamic_feature_extractor" + ) + self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor") + self.assertTrue(feature_extractor.is_local) + + # If remote code is disabled, we load the local one. 
+ feature_extractor = AutoFeatureExtractor.from_pretrained( + "hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=False + ) + self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor") + self.assertTrue(feature_extractor.is_local) + + # If remote is enabled, we load from the Hub + feature_extractor = AutoFeatureExtractor.from_pretrained( + "hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True + ) + self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor") + self.assertTrue(not hasattr(feature_extractor, "is_local")) + + finally: + if "custom" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["custom"] + if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content: + del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig] diff --git a/tests/models/auto/test_image_processing_auto.py b/tests/models/auto/test_image_processing_auto.py index 7b2296e71d..0fb22b6c2b 100644 --- a/tests/models/auto/test_image_processing_auto.py +++ b/tests/models/auto/test_image_processing_auto.py @@ -19,6 +19,7 @@ import tempfile import unittest from pathlib import Path +import transformers from transformers import ( CONFIG_MAPPING, IMAGE_PROCESSOR_MAPPING, @@ -37,6 +38,9 @@ from test_module.custom_image_processing import CustomImageProcessor # noqa E40 class AutoImageProcessorTest(unittest.TestCase): + def setUp(self): + transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0 + def test_image_processor_from_model_shortcut(self): config = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32") self.assertIsInstance(config, CLIPImageProcessor) @@ -130,6 +134,15 @@ class AutoImageProcessorTest(unittest.TestCase): _ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model") def test_from_pretrained_dynamic_image_processor(self): + # If remote code is not set, we will time out when asking whether to load the model. 
+ with self.assertRaises(ValueError): + image_processor = AutoImageProcessor.from_pretrained("hf-internal-testing/test_dynamic_image_processor") + # If remote code is disabled, we can't load this config. + with self.assertRaises(ValueError): + image_processor = AutoImageProcessor.from_pretrained( + "hf-internal-testing/test_dynamic_image_processor", trust_remote_code=False + ) + image_processor = AutoImageProcessor.from_pretrained( "hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True ) @@ -171,3 +184,35 @@ class AutoImageProcessorTest(unittest.TestCase): del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content: del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig] + + def test_from_pretrained_dynamic_image_processor_conflict(self): + class NewImageProcessor(CLIPImageProcessor): + is_local = True + + try: + AutoConfig.register("custom", CustomConfig) + AutoImageProcessor.register(CustomConfig, NewImageProcessor) + # If remote code is not set, the default is to use local + image_processor = AutoImageProcessor.from_pretrained("hf-internal-testing/test_dynamic_image_processor") + self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor") + self.assertTrue(image_processor.is_local) + + # If remote code is disabled, we load the local one. 
+ image_processor = AutoImageProcessor.from_pretrained( + "hf-internal-testing/test_dynamic_image_processor", trust_remote_code=False + ) + self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor") + self.assertTrue(image_processor.is_local) + + # If remote is enabled, we load from the Hub + image_processor = AutoImageProcessor.from_pretrained( + "hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True + ) + self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor") + self.assertTrue(not hasattr(image_processor, "is_local")) + + finally: + if "custom" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["custom"] + if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content: + del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig] diff --git a/tests/models/auto/test_modeling_auto.py b/tests/models/auto/test_modeling_auto.py index 9c788a6155..347cabd38a 100644 --- a/tests/models/auto/test_modeling_auto.py +++ b/tests/models/auto/test_modeling_auto.py @@ -22,6 +22,7 @@ from pathlib import Path import pytest +import transformers from transformers import BertConfig, GPT2Model, is_safetensors_available, is_torch_available from transformers.models.auto.configuration_auto import CONFIG_MAPPING from transformers.testing_utils import ( @@ -92,6 +93,9 @@ if is_torch_available(): @require_torch class AutoModelTest(unittest.TestCase): + def setUp(self): + transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0 + @slow def test_model_from_pretrained(self): for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: @@ -312,6 +316,13 @@ class AutoModelTest(unittest.TestCase): del MODEL_MAPPING._extra_content[CustomConfig] def test_from_pretrained_dynamic_model_distant(self): + # If remote code is not set, we will time out when asking whether to load the model. 
+ with self.assertRaises(ValueError): + model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model") + # If remote code is disabled, we can't load this config. + with self.assertRaises(ValueError): + model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False) + model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True) self.assertEqual(model.__class__.__name__, "NewModel") @@ -416,6 +427,34 @@ class AutoModelTest(unittest.TestCase): if CustomConfig in mapping._extra_content: del mapping._extra_content[CustomConfig] + def test_from_pretrained_dynamic_model_conflict(self): + class NewModelConfigLocal(BertConfig): + model_type = "new-model" + + class NewModel(BertModel): + config_class = NewModelConfigLocal + + try: + AutoConfig.register("new-model", NewModelConfigLocal) + AutoModel.register(NewModelConfigLocal, NewModel) + # If remote code is not set, the default is to use local + model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model") + self.assertEqual(model.config.__class__.__name__, "NewModelConfigLocal") + + # If remote code is disabled, we load the local one. 
+ model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False) + self.assertEqual(model.config.__class__.__name__, "NewModelConfigLocal") + + # If remote is enabled, we load from the Hub + model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True) + self.assertEqual(model.config.__class__.__name__, "NewModelConfig") + + finally: + if "new-model" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["new-model"] + if NewModelConfigLocal in MODEL_MAPPING._extra_content: + del MODEL_MAPPING._extra_content[NewModelConfigLocal] + def test_repo_not_found(self): with self.assertRaisesRegex( EnvironmentError, "bert-base is not a local folder and is not a valid model identifier" diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index e0bb4946f7..a4f3714268 100644 --- a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -24,6 +24,7 @@ from shutil import copyfile from huggingface_hub import HfFolder, Repository, create_repo, delete_repo from requests.exceptions import HTTPError +import transformers from transformers import ( CONFIG_MAPPING, FEATURE_EXTRACTOR_MAPPING, @@ -33,6 +34,8 @@ from transformers import ( AutoFeatureExtractor, AutoProcessor, AutoTokenizer, + BertTokenizer, + ProcessorMixin, Wav2Vec2Config, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, @@ -58,6 +61,9 @@ SAMPLE_PROCESSOR_CONFIG_DIR = get_tests_dir("fixtures") class AutoFeatureExtractorTest(unittest.TestCase): vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"] + def setUp(self): + transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0 + def test_processor_from_model_shortcut(self): processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") self.assertIsInstance(processor, Wav2Vec2Processor) @@ -144,6 +150,15 @@ class AutoFeatureExtractorTest(unittest.TestCase): 
self.assertIsInstance(processor, Wav2Vec2Processor) def test_from_pretrained_dynamic_processor(self): + # If remote code is not set, we will time out when asking whether to load the processor. + with self.assertRaises(ValueError): + processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor") + # If remote code is disabled, we can't load this processor. + with self.assertRaises(ValueError): + processor = AutoProcessor.from_pretrained( + "hf-internal-testing/test_dynamic_processor", trust_remote_code=False + ) + processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor", trust_remote_code=True) self.assertTrue(processor.special_attribute_present) self.assertEqual(processor.__class__.__name__, "NewProcessor") @@ -203,6 +218,58 @@ class AutoFeatureExtractorTest(unittest.TestCase): if CustomConfig in PROCESSOR_MAPPING._extra_content: del PROCESSOR_MAPPING._extra_content[CustomConfig] + def test_from_pretrained_dynamic_processor_conflict(self): + class NewFeatureExtractor(Wav2Vec2FeatureExtractor): + special_attribute_present = False + + class NewTokenizer(BertTokenizer): + special_attribute_present = False + + class NewProcessor(ProcessorMixin): + feature_extractor_class = "AutoFeatureExtractor" + tokenizer_class = "AutoTokenizer" + special_attribute_present = False + + try: + AutoConfig.register("custom", CustomConfig) + AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor) + AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer) + AutoProcessor.register(CustomConfig, NewProcessor) + # If remote code is not set, the default is to use local classes.
+ processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor") + self.assertEqual(processor.__class__.__name__, "NewProcessor") + self.assertFalse(processor.special_attribute_present) + self.assertFalse(processor.feature_extractor.special_attribute_present) + self.assertFalse(processor.tokenizer.special_attribute_present) + + # If remote code is disabled, we load the local ones. + processor = AutoProcessor.from_pretrained( + "hf-internal-testing/test_dynamic_processor", trust_remote_code=False + ) + self.assertEqual(processor.__class__.__name__, "NewProcessor") + self.assertFalse(processor.special_attribute_present) + self.assertFalse(processor.feature_extractor.special_attribute_present) + self.assertFalse(processor.tokenizer.special_attribute_present) + + # If remote is enabled, we load from the Hub. + processor = AutoProcessor.from_pretrained( + "hf-internal-testing/test_dynamic_processor", trust_remote_code=True + ) + self.assertEqual(processor.__class__.__name__, "NewProcessor") + self.assertTrue(processor.special_attribute_present) + self.assertTrue(processor.feature_extractor.special_attribute_present) + self.assertTrue(processor.tokenizer.special_attribute_present) + + finally: + if "custom" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["custom"] + if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content: + del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig] + if CustomConfig in TOKENIZER_MAPPING._extra_content: + del TOKENIZER_MAPPING._extra_content[CustomConfig] + if CustomConfig in PROCESSOR_MAPPING._extra_content: + del PROCESSOR_MAPPING._extra_content[CustomConfig] + def test_auto_processor_creates_tokenizer(self): processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert") self.assertEqual(processor.__class__.__name__, "BertTokenizerFast") diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py index a919ac3eda..a3a7760838 
100644 --- a/tests/models/auto/test_tokenization_auto.py +++ b/tests/models/auto/test_tokenization_auto.py @@ -22,6 +22,7 @@ from pathlib import Path import pytest +import transformers from transformers import ( BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -65,6 +66,9 @@ if is_tokenizers_available(): class AutoTokenizerTest(unittest.TestCase): + def setUp(self): + transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0 + @slow def test_tokenizer_from_pretrained(self): for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x): @@ -298,6 +302,15 @@ class AutoTokenizerTest(unittest.TestCase): del TOKENIZER_MAPPING._extra_content[CustomConfig] def test_from_pretrained_dynamic_tokenizer(self): + # If remote code is not set, we will time out when asking whether to load the tokenizer. + with self.assertRaises(ValueError): + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer") + # If remote code is disabled, we can't load this tokenizer. + with self.assertRaises(ValueError): + tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False + ) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True) self.assertTrue(tokenizer.special_attribute_present) # Test tokenizer can be reloaded.
@@ -326,6 +339,57 @@ class AutoTokenizerTest(unittest.TestCase): self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer") self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer") + @require_tokenizers + def test_from_pretrained_dynamic_tokenizer_conflict(self): + class NewTokenizer(BertTokenizer): + special_attribute_present = False + + class NewTokenizerFast(BertTokenizerFast): + slow_tokenizer_class = NewTokenizer + special_attribute_present = False + + try: + AutoConfig.register("custom", CustomConfig) + AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer) + AutoTokenizer.register(CustomConfig, fast_tokenizer_class=NewTokenizerFast) + # If remote code is not set, the default is to use local + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer") + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast") + self.assertFalse(tokenizer.special_attribute_present) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", use_fast=False) + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer") + self.assertFalse(tokenizer.special_attribute_present) + + # If remote code is disabled, we load the local one. 
+ tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False + ) + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast") + self.assertFalse(tokenizer.special_attribute_present) + tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False, use_fast=False + ) + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer") + self.assertFalse(tokenizer.special_attribute_present) + + # If remote is enabled, we load from the Hub + tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True + ) + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast") + self.assertTrue(tokenizer.special_attribute_present) + tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False + ) + self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer") + self.assertTrue(tokenizer.special_attribute_present) + + finally: + if "custom" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["custom"] + if CustomConfig in TOKENIZER_MAPPING._extra_content: + del TOKENIZER_MAPPING._extra_content[CustomConfig] + def test_from_pretrained_dynamic_tokenizer_legacy_format(self): tokenizer = AutoTokenizer.from_pretrained( "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True