From d0b65bb4797dc11d1d9dc7b9f66e2b6bd5b47ca5 Mon Sep 17 00:00:00 2001
From: huismiling <zh_smiling@yeah.net>
Date: Mon, 31 Mar 2025 17:02:49 +0800
Subject: [PATCH] [MLU] Fix FA2 check error, remove deepspeed-mlu deps.
 (#36159)

* add Cambricon MLUs support

* fix mlu device rng state

* up for quality check

* up mlu to support fp16

* fix mlu device dependency error

* fix mlu device dependency error

* enable mlu device for bf16

* fix mlu device memory tracker

* Cambricon support SDPA and flash_attn

* MLU devices: check whether `mlu` is available via a `cndev`-based check, which won't trigger the drivers and leaves MLU uninitialized

* Fix MLU FA2 check. Remove deepspeed-mlu check. Add MLU test support.

* fix testing errors.

* Merge branch 'hf/main' into main

* fix get_device_count error.

* fix mlu testing utils.

* fix code quality and style.

* switch to @require_torch_multi_accelerator
---
 src/transformers/integrations/deepspeed.py |  5 +--
 src/transformers/modeling_utils.py         |  8 ++++-
 src/transformers/testing_utils.py          | 30 +++++++++++++++--
 tests/generation/test_fsdp.py              | 38 ++++++++++++++++------
 4 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/src/transformers/integrations/deepspeed.py b/src/transformers/integrations/deepspeed.py
index bf47511345..696e284b74 100644
--- a/src/transformers/integrations/deepspeed.py
+++ b/src/transformers/integrations/deepspeed.py
@@ -22,7 +22,7 @@ import weakref
 from functools import partialmethod
 
 from ..dependency_versions_check import dep_version_check
-from ..utils import is_accelerate_available, is_torch_available, is_torch_mlu_available, logging
+from ..utils import is_accelerate_available, is_torch_available, logging
 
 
 if is_torch_available():
@@ -40,9 +40,6 @@ def is_deepspeed_available():
     # AND checking it has an author field in the metadata that is HuggingFace.
     if package_exists:
         try:
-            if is_torch_mlu_available():
-                _ = importlib_metadata.metadata("deepspeed-mlu")
-                return True
             _ = importlib_metadata.metadata("deepspeed")
             return True
         except importlib_metadata.PackageNotFoundError:
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 5b27f0c3db..c279ae391c 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -103,6 +103,7 @@ from .utils import (
     is_safetensors_available,
     is_torch_flex_attn_available,
     is_torch_greater_or_equal,
+    is_torch_mlu_available,
     is_torch_npu_available,
     is_torch_sdpa_available,
     is_torch_xla_available,
@@ -2323,12 +2324,17 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 
         # The check `torch.empty(0).device.type != "cuda"` is needed as the model may be initialized after `torch.set_default_device` has been called,
         # or the model may be initialized under the context manager `with torch.device("cuda"):`.
-        if check_device_map and device_map is None and torch.empty(0).device.type != "cuda":
+        if check_device_map and device_map is None and torch.empty(0).device.type not in ["cuda", "mlu"]:
             if torch.cuda.is_available():
                 logger.warning_once(
                     "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU"
                     " after initializing it on CPU with `model.to('cuda')`."
                 )
+            elif is_torch_mlu_available():
+                logger.warning_once(
+                    "You are attempting to use Flash Attention 2.0 with a model not initialized on MLU. Make sure to move the model to MLU"
+                    " after initializing it on CPU with `model.to('mlu')`."
+                )
             else:
                 raise ValueError(
                     "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU and with no GPU available. "
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 1517be21ed..8a92a9211c 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -144,6 +144,7 @@ from .utils import (
     is_torch_fp16_available_on_device,
     is_torch_greater_or_equal,
     is_torch_hpu_available,
+    is_torch_mlu_available,
     is_torch_neuroncore_available,
     is_torch_npu_available,
     is_torch_sdpa_available,
@@ -940,6 +941,10 @@ if is_torch_available():
             raise ValueError(
                 f"TRANSFORMERS_TEST_DEVICE={torch_device}, but NPU is unavailable. Please double-check your testing environment."
             )
+        if torch_device == "mlu" and not is_torch_mlu_available():
+            raise ValueError(
+                f"TRANSFORMERS_TEST_DEVICE={torch_device}, but MLU is unavailable. Please double-check your testing environment."
+            )
         if torch_device == "hpu" and not is_torch_hpu_available():
             raise ValueError(
                 f"TRANSFORMERS_TEST_DEVICE={torch_device}, but HPU is unavailable. Please double-check your testing environment."
@@ -956,6 +961,8 @@ if is_torch_available():
         torch_device = "cuda"
     elif _run_third_party_device_tests and is_torch_npu_available():
         torch_device = "npu"
+    elif _run_third_party_device_tests and is_torch_mlu_available():
+        torch_device = "mlu"
     elif _run_third_party_device_tests and is_torch_hpu_available():
         torch_device = "hpu"
     elif _run_third_party_device_tests and is_torch_xpu_available():
@@ -2927,9 +2934,21 @@ def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable],
 if is_torch_available():
     # Mappings from device names to callable functions to support device agnostic
     # testing.
-    BACKEND_MANUAL_SEED = {"cuda": torch.cuda.manual_seed, "cpu": torch.manual_seed, "default": torch.manual_seed}
-    BACKEND_EMPTY_CACHE = {"cuda": torch.cuda.empty_cache, "cpu": None, "default": None}
-    BACKEND_DEVICE_COUNT = {"cuda": torch.cuda.device_count, "cpu": lambda: 0, "default": lambda: 1}
+    BACKEND_MANUAL_SEED = {
+        "cuda": torch.cuda.manual_seed,
+        "cpu": torch.manual_seed,
+        "default": torch.manual_seed,
+    }
+    BACKEND_EMPTY_CACHE = {
+        "cuda": torch.cuda.empty_cache,
+        "cpu": None,
+        "default": None,
+    }
+    BACKEND_DEVICE_COUNT = {
+        "cuda": torch.cuda.device_count,
+        "cpu": lambda: 0,
+        "default": lambda: 1,
+    }
 else:
     BACKEND_MANUAL_SEED = {"default": None}
     BACKEND_EMPTY_CACHE = {"default": None}
@@ -2939,6 +2958,11 @@ if is_torch_hpu_available():
     BACKEND_MANUAL_SEED["hpu"] = torch.hpu.manual_seed
     BACKEND_DEVICE_COUNT["hpu"] = torch.hpu.device_count
 
+if is_torch_mlu_available():
+    BACKEND_EMPTY_CACHE["mlu"] = torch.mlu.empty_cache
+    BACKEND_MANUAL_SEED["mlu"] = torch.mlu.manual_seed
+    BACKEND_DEVICE_COUNT["mlu"] = torch.mlu.device_count
+
 if is_torch_npu_available():
     BACKEND_EMPTY_CACHE["npu"] = torch.npu.empty_cache
     BACKEND_MANUAL_SEED["npu"] = torch.npu.manual_seed
diff --git a/tests/generation/test_fsdp.py b/tests/generation/test_fsdp.py
index 904ccdea63..2f4c77078f 100644
--- a/tests/generation/test_fsdp.py
+++ b/tests/generation/test_fsdp.py
@@ -15,12 +15,12 @@
 import argparse
 from typing import Any, Callable
 
-from transformers import is_torch_available
+from transformers import is_torch_available, is_torch_mlu_available
 from transformers.testing_utils import (
     TestCasePlus,
     execute_subprocess_async,
     get_torch_dist_unique_port,
-    require_torch_multi_gpu,
+    require_torch_multi_accelerator,
 )
 
 
@@ -46,7 +46,11 @@ if is_torch_available():
         """Manage the creation and destruction of the distributed process group for the wrapped function."""
 
         def wrapped(*args: Any, **kwargs: Any) -> Any:
-            torch.distributed.init_process_group(world_size=torch.cuda.device_count())
+            if is_torch_mlu_available():
+                device_count = torch.mlu.device_count()
+            else:
+                device_count = torch.cuda.device_count()
+            torch.distributed.init_process_group(world_size=device_count)
             try:
                 return func(*args, **kwargs)
             finally:
@@ -56,7 +60,10 @@ if is_torch_available():
 
     @manage_process_group
     def fsdp_generate():
-        torch.cuda.set_device(device := torch.device(rank := torch.distributed.get_rank()))
+        if is_torch_mlu_available():
+            torch.mlu.set_device(device := torch.device(rank := torch.distributed.get_rank()))
+        else:
+            torch.cuda.set_device(device := torch.device(rank := torch.distributed.get_rank()))
 
         model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device)
 
@@ -79,11 +86,14 @@ if is_torch_available():
 
     @manage_process_group
     def fsdp2_generate():
-        torch.cuda.set_device(device := torch.device(rank := torch.distributed.get_rank()))
+        if is_torch_mlu_available():
+            torch.mlu.set_device(device := torch.device(rank := torch.distributed.get_rank()))
+        else:
+            torch.cuda.set_device(device := torch.device(rank := torch.distributed.get_rank()))
 
         model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device)
 
-        mesh = init_device_mesh("cuda", (torch.distributed.get_world_size(),))
+        mesh = init_device_mesh(device.type, (torch.distributed.get_world_size(),))
         for submodule in model.modules():
             if isinstance(submodule, GPT2Block):
                 fully_shard(submodule, mesh=mesh)
@@ -102,9 +112,13 @@ if is_torch_available():
 
 
 class TestFSDPGeneration(TestCasePlus):
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_fsdp_generate(self):
-        distributed_args = f"""--nproc_per_node={torch.cuda.device_count()}
+        if is_torch_mlu_available():
+            device_count = torch.mlu.device_count()
+        else:
+            device_count = torch.cuda.device_count()
+        distributed_args = f"""--nproc_per_node={device_count}
             --master_port={get_torch_dist_unique_port()}
             {self.test_file_dir}/test_fsdp.py
         """.split()
@@ -113,9 +127,13 @@ class TestFSDPGeneration(TestCasePlus):
         execute_subprocess_async(cmd, env=self.get_env())
         # successful return here == success - any errors would have caused an error in the sub-call
 
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_fsdp2_generate(self):
-        distributed_args = f"""--nproc_per_node={torch.cuda.device_count()}
+        if is_torch_mlu_available():
+            device_count = torch.mlu.device_count()
+        else:
+            device_count = torch.cuda.device_count()
+        distributed_args = f"""--nproc_per_node={device_count}
             --master_port={get_torch_dist_unique_port()}
             {self.test_file_dir}/test_fsdp.py
         """.split()