[core] Large/full refactor of from_pretrained (#36033)

* squash everything together start to simplify inner logic Update modeling_utils.py Update modeling_utils.py Update modeling_utils.py Update modeling_utils.py continue refactor fix small fixes add type hints/docstring Update modeling_utils.py remove _fast_init keep improving Update modeling_utils.py Update modeling_utils.py new first tp loading version style fix weird in-place op trigger CIs Update modeling_utils.py much clearer renaming of keys fix update Update test_modeling_common.py trigger CIs update update style Update modeling_utils.py Update modeling_utils.py Update modeling_utils.py fix fast download first prototype remove old function remove old functions Remove unused function and move back _get_tp_registry fix tp plan registry simplify CIs Update hub.py Update modeling_utils.py simplify simplify renaming logic remove unused check add sanity check back (a test depends on it) Update modeling_utils.py finalize sound renaming logic style add forgotten check Update modeling_utils.py add key_mapping keyword style Update modeling_utils.py add comment minor updates minor change for clarity fix small prefix issue and simplify style trigger CIs typo fix Post rebase fix post rebase cleanup simplify tp typo oupsi typo correctly escape improvements based on Marc's review finalize Marc's review comments squash everything * improve * Update modeling_utils.py * Update modeling_utils.py * fix * Update modeling_utils.py * Update modeling_utils.py * style * Update modeling_utils.py * simplify * style * Update modeling_utils.py * Update modeling_utils.py * Update modeling_utils.py * Update modeling_utils.py * Update modeling_utils.py * Update modeling_utils.py * fix dtype issue * Update modeling_utils.py * style * remove test that does not make sense * style * small fixes * style * fix * cleanup after rebase * style * typo * escape * tp for task specific top modules * Update modeling_utils.py * Update modeling_utils.py * fix allocation * CIs * CIs * CIs * improve docstring * CIs * Update modeling_utils.py * fix
2025-03-12 13:39:25 +01:00
parent 7652804d23
commit 071a161d3e
15 changed files with 1525 additions and 1542 deletions
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -2368,10 +2368,9 @@ class ModelTesterMixin:
                safe_save_file(placeholder_dict, os.path.join(tmp_dir, "model.safetensors"), metadata={"format": "pt"})
                model_reloaded, infos = model_class.from_pretrained(tmp_dir, output_loading_info=True)

-                prefix = f"{model_reloaded.base_model_prefix}."
                params = dict(model_reloaded.named_parameters())
                params.update(dict(model_reloaded.named_buffers()))
-                param_names = {k[len(prefix) :] if k.startswith(prefix) else k for k in params.keys()}
+                param_names = set(params.keys())

                missing_keys = set(infos["missing_keys"])

@@ -2383,9 +2382,8 @@ class ModelTesterMixin:
                    ptrs[id_tensor_storage(tensor)].append(name)
                tied_params = [names for _, names in ptrs.items() if len(names) > 1]
                for group in tied_params:
-                    group = {k[len(prefix) :] if k.startswith(prefix) else k for k in group}
                    # We remove the group from extra_missing if not all weights from group are in it
-                    if len(group - extra_missing) > 0:
+                    if len(set(group) - extra_missing) > 0:
                        extra_missing = extra_missing - set(group)

                self.assertEqual(
@@ -2399,15 +2397,14 @@ class ModelTesterMixin:
                # Remove nonpersistent buffers from missed_missing
                buffers = [n for n, _ in model_reloaded.named_buffers()]
                nonpersistent_buffers = {n for n in buffers if n not in model_reloaded.state_dict()}
-                nonpersistent_buffers = {
-                    k[len(prefix) :] if k.startswith(prefix) else k for k in nonpersistent_buffers
-                }
                missed_missing = missed_missing - nonpersistent_buffers

                if model_reloaded._keys_to_ignore_on_load_missing is None:
                    expected_missing = set()
                else:
-                    expected_missing = set(model_reloaded._keys_to_ignore_on_load_missing)
+                    expected_missing = set()
+                    for pattern in model_reloaded._keys_to_ignore_on_load_missing:
+                        expected_missing.update({k for k in param_names if re.search(pattern, k) is not None})
                self.assertEqual(
                    missed_missing,
                    expected_missing,
--- a/tests/utils/test_hub_utils.py
+++ b/tests/utils/test_hub_utils.py
@@ -28,7 +28,6 @@ from transformers.utils import (
    TRANSFORMERS_CACHE,
    WEIGHTS_NAME,
    cached_file,
-    get_file_from_repo,
    has_file,
 )

@@ -87,14 +86,8 @@ class GetFromCacheTests(unittest.TestCase):
        path = cached_file(RANDOM_BERT, "conf", local_files_only=True, _raise_exceptions_for_missing_entries=False)
        self.assertIsNone(path)

-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-
-        # Under the mock environment we get a 500 error when trying to reach the tokenizer.
-        with mock.patch("requests.Session.request", return_value=response_mock) as mock_head:
+        # Under the mock environment, hf_hub_download will always raise an HTTPError
+        with mock.patch("transformers.utils.hub.hf_hub_download", side_effect=HTTPError) as mock_head:
            path = cached_file(RANDOM_BERT, "conf", _raise_exceptions_for_connection_errors=False)
            self.assertIsNone(path)
            # This check we did call the fake head request
@@ -117,18 +110,45 @@ class GetFromCacheTests(unittest.TestCase):
            assert has_file(TINY_BERT_PT_ONLY, WEIGHTS_NAME, local_files_only=True, cache_dir=tmp_dir)

    def test_get_file_from_repo_distant(self):
-        # `get_file_from_repo` returns None if the file does not exist
-        self.assertIsNone(get_file_from_repo("google-bert/bert-base-cased", "ahah.txt"))
+        # should return None if the file does not exist
+        self.assertIsNone(
+            cached_file(
+                "google-bert/bert-base-cased",
+                "ahah.txt",
+                _raise_exceptions_for_gated_repo=False,
+                _raise_exceptions_for_missing_entries=False,
+                _raise_exceptions_for_connection_errors=False,
+            )
+        )

        # The function raises if the repository does not exist.
        with self.assertRaisesRegex(EnvironmentError, "is not a valid model identifier"):
-            get_file_from_repo("bert-base-case", CONFIG_NAME)
+            cached_file(
+                "bert-base-case",
+                CONFIG_NAME,
+                _raise_exceptions_for_gated_repo=False,
+                _raise_exceptions_for_missing_entries=False,
+                _raise_exceptions_for_connection_errors=False,
+            )

        # The function raises if the revision does not exist.
        with self.assertRaisesRegex(EnvironmentError, "is not a valid git identifier"):
-            get_file_from_repo("google-bert/bert-base-cased", CONFIG_NAME, revision="ahaha")
+            cached_file(
+                "google-bert/bert-base-cased",
+                CONFIG_NAME,
+                revision="ahaha",
+                _raise_exceptions_for_gated_repo=False,
+                _raise_exceptions_for_missing_entries=False,
+                _raise_exceptions_for_connection_errors=False,
+            )

-        resolved_file = get_file_from_repo("google-bert/bert-base-cased", CONFIG_NAME)
+        resolved_file = cached_file(
+            "google-bert/bert-base-cased",
+            CONFIG_NAME,
+            _raise_exceptions_for_gated_repo=False,
+            _raise_exceptions_for_missing_entries=False,
+            _raise_exceptions_for_connection_errors=False,
+        )
        # The name is the cached name which is not very easy to test, so instead we load the content.
        config = json.loads(open(resolved_file, "r").read())
        self.assertEqual(config["hidden_size"], 768)
@@ -137,9 +157,26 @@ class GetFromCacheTests(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = Path(tmp_dir) / "a.txt"
            filename.touch()
-            self.assertEqual(get_file_from_repo(tmp_dir, "a.txt"), str(filename))
+            self.assertEqual(
+                cached_file(
+                    tmp_dir,
+                    "a.txt",
+                    _raise_exceptions_for_gated_repo=False,
+                    _raise_exceptions_for_missing_entries=False,
+                    _raise_exceptions_for_connection_errors=False,
+                ),
+                str(filename),
+            )

-            self.assertIsNone(get_file_from_repo(tmp_dir, "b.txt"))
+            self.assertIsNone(
+                cached_file(
+                    tmp_dir,
+                    "b.txt",
+                    _raise_exceptions_for_gated_repo=False,
+                    _raise_exceptions_for_missing_entries=False,
+                    _raise_exceptions_for_connection_errors=False,
+                )
+            )

    def test_get_file_gated_repo(self):
        """Test download file from a gated repo fails with correct message when not authenticated."""
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 import copy
 import glob
-import itertools
 import json
 import os
 import os.path
@@ -525,13 +524,12 @@ class ModelUtilsTest(TestCasePlus):
        self.assertEqual(model.vision_tower.dtype, torch.bfloat16)
        self.assertEqual(model.multi_modal_projector.linear_1.weight.dtype, torch.float16)

-        # TODO @ARTHURZUCKER FIX THIS
        # but if the model has `_keep_in_fp32_modules` then those modules should be in fp32 no matter what
-        # LlavaForConditionalGeneration._keep_in_fp32_modules = ["multi_modal_projector"]
-        # model = LlavaForConditionalGeneration.from_pretrained(TINY_LLAVA, config=config, torch_dtype="auto")
-        # self.assertEqual(model.language_model.dtype, torch.float32)
-        # self.assertEqual(model.vision_tower.dtype, torch.bfloat16)
-        # self.assertEqual(model.multi_modal_projector.linear_1.weight.dtype, torch.float32)
+        LlavaForConditionalGeneration._keep_in_fp32_modules = ["multi_modal_projector"]
+        model = LlavaForConditionalGeneration.from_pretrained(TINY_LLAVA, config=config, torch_dtype="auto")
+        self.assertEqual(model.language_model.dtype, torch.float32)
+        self.assertEqual(model.vision_tower.dtype, torch.bfloat16)
+        self.assertEqual(model.multi_modal_projector.linear_1.weight.dtype, torch.float32)

        # torch.set_default_dtype() supports only float dtypes, so will fail with non-float type
        with self.assertRaises(ValueError):
@@ -540,20 +538,6 @@ class ModelUtilsTest(TestCasePlus):
                TINY_LLAVA, torch_dtype={"text_config": "float32", "vision_config": "int64", "": "float16"}
            )

-    @require_torch
-    @unittest.skip("Broken by @arthurzucker because the fix was not correct. Knowing the context is super hard")
-    def test_model_from_pretrained_meta_device(self):
-        def is_on_meta(model_id, dtype):
-            with torch.device("meta"):
-                model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype)
-                return all(value.device.type == "meta" for value in model.state_dict().values())
-
-        model_ids = ("fxmarty/tiny-llama-fast-tokenizer", "fxmarty/small-llama-testing")
-        dtypes = (None, "auto", torch.float16)
-
-        for model_id, dtype in itertools.product(model_ids, dtypes):
-            self.assertTrue(is_on_meta(model_id, dtype))
-
    def test_model_from_pretrained_torch_dtype(self):
        # test that the model can be instantiated with dtype of either
        # 1. explicit from_pretrained's torch_dtype argument