From 0b92ae3489577ea5a1ced167f7b60ac243f333a1 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Thu, 27 Jul 2023 16:35:17 +0200
Subject: [PATCH] Add offload support to Bark (#25037)

* initial Bark offload proposal

* use hooks instead of manually offloading

* add test of bark offload to cpu feature

* Apply nit suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docstrings of offload

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* remove unecessary set_seed in Bark tests

---------

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/bark/modeling_bark.py | 107 ++++++++++++++++--
 tests/models/bark/test_modeling_bark.py       |  41 ++++++-
 2 files changed, 140 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py
index ae70a91584..1cc42bc811 100644
--- a/src/transformers/models/bark/modeling_bark.py
+++ b/src/transformers/models/bark/modeling_bark.py
@@ -23,8 +23,13 @@ from torch.nn import functional as F
 
 from ...generation.logits_process import AlternatingCodebooksLogitsProcessor, SuppressTokensLogitsProcessor
 from ...modeling_outputs import CausalLMOutputWithPast, MaskedLMOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ...modeling_utils import PreTrainedModel, get_parameter_device
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_accelerate_available,
+    logging,
+)
 from ..auto import AutoModel
 from .configuration_bark import (
     BarkCoarseConfig,
@@ -288,6 +293,26 @@ class BarkPreTrainedModel(PreTrainedModel):
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
+    @property
+    def device(self) -> torch.device:
+        """
+        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+        device).
+        """
+
+        # if has _hf_hook, has been offloaded so the device has to be found in the hook
+        if not hasattr(self, "_hf_hook"):
+            return get_parameter_device(self)
+        for module in self.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+
+        return get_parameter_device(self)
+
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, BarkCausalModel) or isinstance(module, BarkFineModel) or isinstance(module, BarkModel):
             module.gradient_checkpointing = value
@@ -1376,6 +1401,63 @@ class BarkModel(BarkPreTrainedModel):
 
         self.config = config
 
+    @property
+    def device(self) -> torch.device:
+        """
+        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+        device).
+        """
+        # for bark_model, device must be verified on its sub-models
+        # if has _hf_hook, has been offloaded so the device has to be found in the hook
+        if not hasattr(self.semantic, "_hf_hook"):
+            return get_parameter_device(self)
+        for module in self.semantic.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+
+    def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
+        r"""
+        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
+        method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until
+        the next sub-model runs.
+
+        Args:
+            gpu_id (`int`, *optional*, defaults to 0):
+                GPU id on which the sub-models will be loaded and offloaded.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu")
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        # this layer is used outside the first foward pass of semantic so need to be loaded before semantic
+        self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
+
+        hook = None
+        for cpu_offloaded_model in [
+            self.semantic,
+            self.coarse_acoustics,
+            self.fine_acoustics,
+        ]:
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+        self.fine_acoustics_hook = hook
+
+        _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)
+
+        # We'll offload the last model manually.
+        self.codec_model_hook = hook
+
     def codec_decode(self, fine_output):
         """Turn quantized audio codes into audio array using encodec."""
 
@@ -1402,13 +1484,13 @@ class BarkModel(BarkPreTrainedModel):
                 longest generation among the batch.
             history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                 Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
-        kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
+            kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
 
-            - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
-            - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
-              semantic, coarse and fine respectively. It has the priority over the keywords without a prefix.
+                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
+                - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
+                semantic, coarse and fine respectively. It has the priority over the keywords without a prefix.
 
-            This means you can, for example, specify a generation strategy for all sub-models except one.
+                This means you can, for example, specify a generation strategy for all sub-models except one.
         Returns:
             torch.LongTensor: Output generated audio.
 
@@ -1490,9 +1572,20 @@ class BarkModel(BarkPreTrainedModel):
             **kwargs_fine,
         )
 
+        if getattr(self, "fine_acoustics_hook", None) is not None:
+            # Manually offload fine_acoustics to CPU
+            # and load codec_model to GPU
+            # since bark doesn't use codec_model forward pass
+            self.fine_acoustics_hook.offload()
+            self.codec_model = self.codec_model.to(self.device)
+
         # 4. Decode the output and generate audio array
         audio = self.codec_decode(output)
 
+        if getattr(self, "codec_model_hook", None) is not None:
+            # Offload codec_model to CPU
+            self.codec_model_hook.offload()
+
         return audio
 
     def can_generate(self) -> bool:
diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py
index e2826fcfa2..4141e5c188 100644
--- a/tests/models/bark/test_modeling_bark.py
+++ b/tests/models/bark/test_modeling_bark.py
@@ -31,7 +31,7 @@ from transformers.models.bark.generation_configuration_bark import (
     BarkFineGenerationConfig,
     BarkSemanticGenerationConfig,
 )
-from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
 from transformers.utils import cached_property
 
 from ...generation.test_utils import GenerationTesterMixin
@@ -989,3 +989,42 @@ class BarkModelIntegrationTests(unittest.TestCase):
                 coarse_temperature=0.2,
                 fine_temperature=0.1,
             )
+
+    @require_torch_gpu
+    @slow
+    def test_generate_end_to_end_with_offload(self):
+        input_ids = self.inputs
+
+        with torch.no_grad():
+            # standard generation
+            output_with_no_offload = self.model.generate(**input_ids, do_sample=False, fine_temperature=None)
+
+            torch.cuda.empty_cache()
+
+            memory_before_offload = torch.cuda.memory_allocated()
+            model_memory_footprint = self.model.get_memory_footprint()
+
+            # activate cpu offload
+            self.model.enable_cpu_offload()
+
+            memory_after_offload = torch.cuda.memory_allocated()
+
+            # checks if the model have been offloaded
+
+            # CUDA memory usage after offload should be near 0, leaving room to small differences
+            room_for_difference = 1.1
+            self.assertGreater(
+                (memory_before_offload - model_memory_footprint) * room_for_difference, memory_after_offload
+            )
+
+            # checks if device is the correct one
+            self.assertEqual(self.model.device.type, torch_device)
+
+            # checks if hooks exist
+            self.assertTrue(hasattr(self.model.semantic, "_hf_hook"))
+
+            # output with cpu offload
+            output_with_offload = self.model.generate(**input_ids, do_sample=False, fine_temperature=None)
+
+        # checks if same output
+        self.assertListEqual(output_with_no_offload.tolist(), output_with_offload.tolist())