From 8c5c180de095e7d9974f12a4c6738b2235424e5d Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Fri, 5 Jul 2024 08:07:07 +0200
Subject: [PATCH] Fix serialization for offloaded model (#31727)

* Fix serialization

* style

* add test
---
 src/transformers/modeling_utils.py | 18 +++++++++---------
 tests/utils/test_modeling_utils.py | 23 ++++++++++++++++++++---
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index c991c1c95b..399cdbf89d 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -2518,9 +2518,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 
         # Save the model
         if state_dict is None:
-            # if any model parameters are offloaded to the disk, make module map
-            if hasattr(self, "hf_device_map") and (
-                "cpu" in self.hf_device_map.values() or "disk" in self.hf_device_map.values()
+            # if any model parameters are offloaded, make module map
+            if (
+                hasattr(self, "hf_device_map")
+                and len(set(self.hf_device_map.values())) > 1
+                and ("cpu" in self.hf_device_map.values() or "disk" in self.hf_device_map.values())
             ):
                 warnings.warn(
                     "Attempting to save a model with offloaded modules. Ensure that unallocated cpu memory exceeds the `shard_size` (5GB default)"
@@ -2532,7 +2534,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 
                     for key in module_state_dict:
                         module_map[name + f".{key}"] = module
-
             state_dict = model_to_save.state_dict()
 
         # Translate state_dict from smp to hf if saving with smp >= 1.10
@@ -2655,7 +2656,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                 and reg.fullmatch(filename_no_suffix) is not None
             ):
                 os.remove(full_filename)
-
         # Save the model
         for shard_file, tensors in state_dict_split.filename_to_tensors.items():
             shard = {tensor: state_dict[tensor] for tensor in tensors}
@@ -2667,15 +2667,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                         f"Please upgrade accelerate with `pip install -U accelerate`"
                     )
                 # init state_dict for this shard
-                state_dict = {name: "" for name in shard}
+                shard_state_dict = {name: "" for name in shard}
                 for module_name in shard:
                     module = module_map[module_name]
                     # update state dict with onloaded parameters
-                    state_dict = get_state_dict_from_offload(module, module_name, state_dict)
+                    shard_state_dict = get_state_dict_from_offload(module, module_name, shard_state_dict)
 
                 # assign shard to be the completed state dict
-                shard = state_dict
-                del state_dict
+                shard = shard_state_dict
+                del shard_state_dict
                 gc.collect()
 
             if safe_serialization:
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index ba70983f40..134b4758e6 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -1065,6 +1065,23 @@ class ModelUtilsTest(TestCasePlus):
             # This check we did call the fake head request
             mock_head.assert_called()
 
+    @require_accelerate
+    @mark.accelerate_tests
+    def test_save_model_with_device_map_cpu(self):
+        model_id = "hf-internal-testing/tiny-random-gpt2"
+        inputs = torch.tensor([[1, 2, 3]])
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
+            output = model(inputs)[0]
+            model.save_pretrained(
+                tmp_dir, max_shard_size="200KB"
+            )  # model is 1.6MB, max shard size is allocated to cpu by default
+            saved_model = AutoModelForCausalLM.from_pretrained(tmp_dir, device_map="cpu")
+            saved_model_output = saved_model(inputs)[0]
+
+        self.assertTrue(torch.allclose(output, saved_model_output))
+
     @require_accelerate
     @mark.accelerate_tests
     @require_torch_accelerator
@@ -1083,9 +1100,9 @@ class ModelUtilsTest(TestCasePlus):
 
         # check_models_equal requires onloaded tensors
         model_id = "hf-internal-testing/tiny-random-gpt2"
-        onloaded_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
+        onloaded_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu").to(f"{torch_device}:0")
         inputs = torch.tensor([[1, 2, 3]]).to(f"{torch_device}:0")
-        cpu_output = onloaded_model(inputs)[0]
+        output = onloaded_model(inputs)[0]
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             offload_folder = os.path.join(tmp_dir, "offload")
@@ -1099,7 +1116,7 @@ class ModelUtilsTest(TestCasePlus):
             saved_model = AutoModelForCausalLM.from_pretrained(tmp_dir, device_map=device_map)
             postsaved_output = saved_model(inputs)[0]
 
-        self.assertTrue(torch.allclose(cpu_output, presaved_output, atol=1e-4))
+        self.assertTrue(torch.allclose(output, presaved_output, atol=1e-4))
         self.assertTrue(torch.allclose(presaved_output, postsaved_output))
 
     @require_safetensors