Use code on the Hub from another repo (#22814)

* Initial work
* Add other classes
* Refactor code
* Move warning and fix dynamic pipeline
* Issue warning when necessary
* Add test
* Do not skip auto tests
* Fix failing tests
* Refactor and address review comments
* Address review comments
2023-04-18 13:46:11 -04:00
parent aec10d162f
commit 5f9b825c89
15 changed files with 124 additions and 66 deletions
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -30,6 +30,7 @@ from .dynamic_module_utils import custom_object_save
 from .utils import (
    CONFIG_NAME,
    PushToHubMixin,
+    add_model_info_to_auto_map,
    cached_file,
    copy_func,
    download_url,
@@ -667,6 +668,10 @@ class PretrainedConfig(PushToHubMixin):
        else:
            logger.info(f"loading configuration file {configuration_file} from cache at {resolved_config_file}")

+        if "auto_map" in config_dict and not is_local:
+            config_dict["auto_map"] = add_model_info_to_auto_map(
+                config_dict["auto_map"], pretrained_model_name_or_path
+            )
        return config_dict, kwargs

    @classmethod
--- a/src/transformers/dynamic_module_utils.py
+++ b/src/transformers/dynamic_module_utils.py
@@ -29,6 +29,7 @@ from .utils import (
    extract_commit_hash,
    is_offline_mode,
    logging,
+    try_to_load_from_cache,
 )


@@ -222,11 +223,16 @@ def get_cached_module_file(

    # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file.
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
-    if os.path.isdir(pretrained_model_name_or_path):
+    is_local = os.path.isdir(pretrained_model_name_or_path)
+    if is_local:
        submodule = pretrained_model_name_or_path.split(os.path.sep)[-1]
    else:
        submodule = pretrained_model_name_or_path.replace("/", os.path.sep)
+        cached_module = try_to_load_from_cache(
+            pretrained_model_name_or_path, module_file, cache_dir=cache_dir, revision=_commit_hash
+        )

+    new_files = []
    try:
        # Load from URL or cache if already cached
        resolved_module_file = cached_file(
@@ -241,6 +247,8 @@ def get_cached_module_file(
            revision=revision,
            _commit_hash=_commit_hash,
        )
+        if not is_local and cached_module != resolved_module_file:
+            new_files.append(module_file)

    except EnvironmentError:
        logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
@@ -284,7 +292,7 @@ def get_cached_module_file(
            importlib.invalidate_caches()
        # Make sure we also have every file with relative
        for module_needed in modules_needed:
-            if not (submodule_path / module_needed).exists():
+            if not (submodule_path / f"{module_needed}.py").exists():
                get_cached_module_file(
                    pretrained_model_name_or_path,
                    f"{module_needed}.py",
@@ -295,14 +303,24 @@ def get_cached_module_file(
                    use_auth_token=use_auth_token,
                    revision=revision,
                    local_files_only=local_files_only,
+                    _commit_hash=commit_hash,
                )
+                new_files.append(f"{module_needed}.py")
+
+    if len(new_files) > 0:
+        new_files = "\n".join([f"- {f}" for f in new_files])
+        logger.warning(
+            f"A new version of the following files was downloaded from {pretrained_model_name_or_path}:\n{new_files}"
+            "\n. Make sure to double-check they do not contain any added malicious code. To avoid downloading new "
+            "versions of the code file, you can pin a revision."
+        )
+
    return os.path.join(full_submodule, module_file)


 def get_class_from_dynamic_module(
+    class_reference: str,
    pretrained_model_name_or_path: Union[str, os.PathLike],
-    module_file: str,
-    class_name: str,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
@@ -323,6 +341,8 @@ def get_class_from_dynamic_module(
    </Tip>

    Args:
+        class_reference (`str`):
+            The full name of the class to load, including its module and optionally its repo.
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

@@ -332,6 +352,7 @@ def get_class_from_dynamic_module(
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

+            This is used when `class_reference` does not specify another repo.
        module_file (`str`):
            The name of the module file containing the class to look for.
        class_name (`str`):
@@ -371,12 +392,25 @@ def get_class_from_dynamic_module(
    ```python
    # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
    # module.
-    cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
+    cls = get_class_from_dynamic_module("modeling.MyBertModel", "sgugger/my-bert-model")
+
+    # Download module `modeling.py` from a given repo and cache then extract the class `MyBertModel` from this
+    # module.
+    cls = get_class_from_dynamic_module("sgugger/my-bert-model--modeling.MyBertModel", "sgugger/another-bert-model")
    ```"""
+    # Catch the name of the repo if it's specified in `class_reference`
+    if "--" in class_reference:
+        repo_id, class_reference = class_reference.split("--")
+        # Invalidate revision since it's not relevant for this repo
+        revision = "main"
+    else:
+        repo_id = pretrained_model_name_or_path
+    module_file, class_name = class_reference.split(".")
+
    # And lastly we get the class inside our newly created module
    final_module = get_cached_module_file(
-        pretrained_model_name_or_path,
-        module_file,
+        repo_id,
+        module_file + ".py",
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -29,6 +29,7 @@ from .utils import (
    FEATURE_EXTRACTOR_NAME,
    PushToHubMixin,
    TensorType,
+    add_model_info_to_auto_map,
    cached_file,
    copy_func,
    download_url,
@@ -469,6 +470,11 @@ class FeatureExtractionMixin(PushToHubMixin):
                f"loading configuration file {feature_extractor_file} from cache at {resolved_feature_extractor_file}"
            )

+        if "auto_map" in feature_extractor_dict and not is_local:
+            feature_extractor_dict["auto_map"] = add_model_info_to_auto_map(
+                feature_extractor_dict["auto_map"], pretrained_model_name_or_path
+            )
+
        return feature_extractor_dict, kwargs

    @classmethod
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -25,6 +25,7 @@ from .feature_extraction_utils import BatchFeature as BaseBatchFeature
 from .utils import (
    IMAGE_PROCESSOR_NAME,
    PushToHubMixin,
+    add_model_info_to_auto_map,
    cached_file,
    copy_func,
    download_url,
@@ -309,6 +310,11 @@ class ImageProcessingMixin(PushToHubMixin):
                f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
            )

+        if "auto_map" in image_processor_dict and not is_local:
+            image_processor_dict["auto_map"] = add_model_info_to_auto_map(
+                image_processor_dict["auto_map"], pretrained_model_name_or_path
+            )
+
        return image_processor_dict, kwargs

    @classmethod
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -403,8 +403,12 @@ class _BaseAutoModelClass:
                    "no malicious code has been contributed in a newer revision."
                )
            class_ref = config.auto_map[cls.__name__]
+            if "--" in class_ref:
+                repo_id, class_ref = class_ref.split("--")
+            else:
+                repo_id = config.name_or_path
            module_file, class_name = class_ref.split(".")
-            model_class = get_class_from_dynamic_module(config.name_or_path, module_file + ".py", class_name, **kwargs)
+            model_class = get_class_from_dynamic_module(repo_id, module_file + ".py", class_name, **kwargs)
            return model_class._from_config(config, **kwargs)
        elif type(config) in cls._model_mapping.keys():
            model_class = _get_model_class(config, cls._model_mapping)
@@ -452,17 +456,10 @@ class _BaseAutoModelClass:
                    "on your local machine. Make sure you have read the code there to avoid malicious use, then set "
                    "the option `trust_remote_code=True` to remove this error."
                )
-            if hub_kwargs.get("revision", None) is None:
-                logger.warning(
-                    "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure "
-                    "no malicious code has been contributed in a newer revision."
-                )
            class_ref = config.auto_map[cls.__name__]
-            module_file, class_name = class_ref.split(".")
            model_class = get_class_from_dynamic_module(
-                pretrained_model_name_or_path, module_file + ".py", class_name, **hub_kwargs, **kwargs
+                class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs
            )
-            model_class.register_for_auto_class(cls.__name__)
            return model_class.from_pretrained(
                pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
            )
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -921,17 +921,8 @@ class AutoConfig:
                    " repo on your local machine. Make sure you have read the code there to avoid malicious use, then"
                    " set the option `trust_remote_code=True` to remove this error."
                )
-            if kwargs.get("revision", None) is None:
-                logger.warning(
-                    "Explicitly passing a `revision` is encouraged when loading a configuration with custom code to "
-                    "ensure no malicious code has been contributed in a newer revision."
-                )
            class_ref = config_dict["auto_map"]["AutoConfig"]
-            module_file, class_name = class_ref.split(".")
-            config_class = get_class_from_dynamic_module(
-                pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
-            )
-            config_class.register_for_auto_class()
+            config_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
            return config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif "model_type" in config_dict:
            config_class = CONFIG_MAPPING[config_dict["model_type"]]
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -333,17 +333,9 @@ class AutoFeatureExtractor:
                        "in that repo on your local machine. Make sure you have read the code there to avoid "
                        "malicious use, then set the option `trust_remote_code=True` to remove this error."
                    )
-                if kwargs.get("revision", None) is None:
-                    logger.warning(
-                        "Explicitly passing a `revision` is encouraged when loading a feature extractor with custom "
-                        "code to ensure no malicious code has been contributed in a newer revision."
-                    )
-
-                module_file, class_name = feature_extractor_auto_map.split(".")
                feature_extractor_class = get_class_from_dynamic_module(
-                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
+                    feature_extractor_auto_map, pretrained_model_name_or_path, **kwargs
                )
-                feature_extractor_class.register_for_auto_class()
            else:
                feature_extractor_class = feature_extractor_class_from_name(feature_extractor_class)

--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -355,17 +355,9 @@ class AutoImageProcessor:
                        "in that repo on your local machine. Make sure you have read the code there to avoid "
                        "malicious use, then set the option `trust_remote_code=True` to remove this error."
                    )
-                if kwargs.get("revision", None) is None:
-                    logger.warning(
-                        "Explicitly passing a `revision` is encouraged when loading a image processor with custom "
-                        "code to ensure no malicious code has been contributed in a newer revision."
-                    )
-
-                module_file, class_name = image_processor_auto_map.split(".")
                image_processor_class = get_class_from_dynamic_module(
-                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
+                    image_processor_auto_map, pretrained_model_name_or_path, **kwargs
                )
-                image_processor_class.register_for_auto_class()
            else:
                image_processor_class = image_processor_class_from_name(image_processor_class)

--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -254,17 +254,10 @@ class AutoProcessor:
                        "in that repo on your local machine. Make sure you have read the code there to avoid "
                        "malicious use, then set the option `trust_remote_code=True` to remove this error."
                    )
-                if kwargs.get("revision", None) is None:
-                    logger.warning(
-                        "Explicitly passing a `revision` is encouraged when loading a feature extractor with custom "
-                        "code to ensure no malicious code has been contributed in a newer revision."
-                    )

-                module_file, class_name = processor_auto_map.split(".")
                processor_class = get_class_from_dynamic_module(
-                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
+                    processor_auto_map, pretrained_model_name_or_path, **kwargs
                )
-                processor_class.register_for_auto_class()
            else:
                processor_class = processor_class_from_name(processor_class)

--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -671,22 +671,12 @@ class AutoTokenizer:
                        " repo on your local machine. Make sure you have read the code there to avoid malicious use,"
                        " then set the option `trust_remote_code=True` to remove this error."
                    )
-                if kwargs.get("revision", None) is None:
-                    logger.warning(
-                        "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure"
-                        " no malicious code has been contributed in a newer revision."
-                    )

                if use_fast and tokenizer_auto_map[1] is not None:
                    class_ref = tokenizer_auto_map[1]
                else:
                    class_ref = tokenizer_auto_map[0]
-
-                module_file, class_name = class_ref.split(".")
-                tokenizer_class = get_class_from_dynamic_module(
-                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
-                )
-                tokenizer_class.register_for_auto_class()
+                tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)

            elif use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
--- a/src/transformers/pipelines/init.py
+++ b/src/transformers/pipelines/init.py
@@ -727,9 +727,8 @@ def pipeline(
                    " set the option `trust_remote_code=True` to remove this error."
                )
            class_ref = targeted_task["impl"]
-            module_file, class_name = class_ref.split(".")
            pipeline_class = get_class_from_dynamic_module(
-                model, module_file + ".py", class_name, revision=revision, use_auth_token=use_auth_token
+                class_ref, model, revision=revision, use_auth_token=use_auth_token
            )
    else:
        normalized_task, targeted_task, task_options = check_task(task)
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -40,6 +40,7 @@ from .utils import (
    PushToHubMixin,
    TensorType,
    add_end_docstrings,
+    add_model_info_to_auto_map,
    cached_file,
    copy_func,
    download_url,
@@ -1817,6 +1818,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            cache_dir=cache_dir,
            local_files_only=local_files_only,
            _commit_hash=commit_hash,
+            _is_local=is_local,
            **kwargs,
        )

@@ -1831,6 +1833,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        cache_dir=None,
        local_files_only=False,
        _commit_hash=None,
+        _is_local=False,
        **kwargs,
    ):
        # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
@@ -1861,7 +1864,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
            config_tokenizer_class = init_kwargs.get("tokenizer_class")
            init_kwargs.pop("tokenizer_class", None)
-            init_kwargs.pop("auto_map", None)
            saved_init_inputs = init_kwargs.pop("init_inputs", ())
            if not init_inputs:
                init_inputs = saved_init_inputs
@@ -1869,6 +1871,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            config_tokenizer_class = None
            init_kwargs = init_configuration

+        if "auto_map" in init_kwargs and not _is_local:
+            # For backward compatibility with old format.
+            if isinstance(init_kwargs["auto_map"], (tuple, list)):
+                init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}
+            init_kwargs["auto_map"] = add_model_info_to_auto_map(
+                init_kwargs["auto_map"], pretrained_model_name_or_path
+            )
+
        if config_tokenizer_class is None:
            from .models.auto.configuration_auto import AutoConfig  # tests_ignore

--- a/src/transformers/utils/init.py
+++ b/src/transformers/utils/init.py
@@ -33,6 +33,7 @@ from .generic import (
    ModelOutput,
    PaddingStrategy,
    TensorType,
+    add_model_info_to_auto_map,
    cached_property,
    can_return_loss,
    expand_dims,
@@ -83,6 +84,7 @@ from .hub import (
    is_remote_url,
    move_cache,
    send_example_telemetry,
+    try_to_load_from_cache,
 )
 from .import_utils import (
    ENV_VARS_TRUE_AND_AUTO_VALUES,
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -535,3 +535,16 @@ def tensor_size(array):
        return array.size
    else:
        raise ValueError(f"Type not supported for expand_dims: {type(array)}.")
+
+
+def add_model_info_to_auto_map(auto_map, repo_id):
+    """
+    Prefix each class reference in `auto_map` (in place) with `{repo_id}--`, keeping `None` and prefixed entries.
+    """
+    for key, value in auto_map.items():
+        if isinstance(value, (tuple, list)):
+            # A tokenizer entry can be `[slow_class, None]` when there is no fast tokenizer: leave `None` as is.
+            auto_map[key] = [f"{repo_id}--{v}" if (v is not None and "--" not in v) else v for v in value]
+        elif value is not None and "--" not in value:
+            auto_map[key] = f"{repo_id}--{value}"
+    return auto_map
--- a/tests/models/auto/test_modeling_auto.py
+++ b/tests/models/auto/test_modeling_auto.py
@@ -298,6 +298,34 @@ class AutoModelTest(unittest.TestCase):
        for p1, p2 in zip(model.parameters(), reloaded_model.parameters()):
            self.assertTrue(torch.equal(p1, p2))

+    def test_from_pretrained_dynamic_model_distant_with_ref(self):
+        model = AutoModel.from_pretrained("hf-internal-testing/ref_to_test_dynamic_model", trust_remote_code=True)
+        self.assertEqual(model.__class__.__name__, "NewModel")
+
+        # Save to a temporary directory and check the model can be reloaded from that local copy.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            reloaded_model = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
+
+        self.assertEqual(reloaded_model.__class__.__name__, "NewModel")
+        for p1, p2 in zip(model.parameters(), reloaded_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
+        # This one uses a relative import of a util file; this checks that it is downloaded and used properly.
+        model = AutoModel.from_pretrained(
+            "hf-internal-testing/ref_to_test_dynamic_model_with_util", trust_remote_code=True
+        )
+        self.assertEqual(model.__class__.__name__, "NewModel")
+
+        # Save to a temporary directory and check the model can be reloaded from that local copy.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            reloaded_model = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
+
+        self.assertEqual(reloaded_model.__class__.__name__, "NewModel")
+        for p1, p2 in zip(model.parameters(), reloaded_model.parameters()):
+            self.assertTrue(torch.equal(p1, p2))
+
    def test_new_model_registration(self):
        AutoConfig.register("custom", CustomConfig)