From 9c875839c0800c55c23f651ec4a1821d0c20ca04 Mon Sep 17 00:00:00 2001
From: statelesshz <3140102143@zju.edu.cn>
Date: Tue, 18 Jul 2023 20:20:32 +0800
Subject: [PATCH] add ascend npu accelerator support (#24879)

* Add Ascend NPU accelerator support

* fix style warining
---
 src/transformers/__init__.py              |  2 ++
 src/transformers/testing_utils.py         | 21 +++++++++++++++++++++
 src/transformers/trainer_utils.py         |  3 +++
 src/transformers/training_args.py         |  8 +++++++-
 src/transformers/utils/__init__.py        |  1 +
 src/transformers/utils/import_utils.py    | 19 +++++++++++++++++++
 tests/trainer/test_trainer_distributed.py | 15 +++++++++++++++
 7 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 80eb14588f..409c081225 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -721,6 +721,7 @@ _import_structure = {
         "is_tokenizers_available",
         "is_torch_available",
         "is_torch_neuroncore_available",
+        "is_torch_npu_available",
         "is_torch_tpu_available",
         "is_torchvision_available",
         "is_vision_available",
@@ -4643,6 +4644,7 @@ if TYPE_CHECKING:
         is_tokenizers_available,
         is_torch_available,
         is_torch_neuroncore_available,
+        is_torch_npu_available,
         is_torch_tpu_available,
         is_torchvision_available,
         is_vision_available,
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 698327f658..a9ab304d2a 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -91,6 +91,7 @@ from .utils import (
     is_torch_bf16_cpu_available,
     is_torch_bf16_gpu_available,
     is_torch_neuroncore_available,
+    is_torch_npu_available,
     is_torch_tensorrt_fx_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
@@ -587,6 +588,26 @@ def require_torch_neuroncore(test_case):
     )
 
 
+def require_torch_npu(test_case):
+    """
+    Decorator marking a test that requires NPU (in PyTorch).
+    """
+    return unittest.skipUnless(is_torch_npu_available(), "test requires PyTorch NPU")(test_case)
+
+
+def require_torch_multi_npu(test_case):
+    """
+    Decorator marking a test that requires a multi-NPU setup (in PyTorch). These tests are skipped on a machine without
+    multiple NPUs.
+
+    To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu"
+    """
+    if not is_torch_npu_available():
+        return unittest.skip("test requires PyTorch NPU")(test_case)
+
+    return unittest.skipUnless(torch.npu.device_count() > 1, "test requires multiple NPUs")(test_case)
+
+
 if is_torch_available():
     # Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
     import torch
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index 74f01ad927..30571597c2 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -36,6 +36,7 @@ from .utils import (
     is_torch_available,
     is_torch_cuda_available,
     is_torch_mps_available,
+    is_torch_npu_available,
     is_torch_tpu_available,
     requires_backends,
 )
@@ -94,6 +95,8 @@ def set_seed(seed: int):
         torch.manual_seed(seed)
         torch.cuda.manual_seed_all(seed)
         # ^^ safe to call this function even if cuda is not available
+    if is_torch_npu_available():
+        torch.npu.manual_seed_all(seed)
     if is_tf_available():
         tf.random.set_seed(seed)
 
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 74f6d929e0..3a30adbe0b 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -47,6 +47,7 @@ from .utils import (
     is_torch_bf16_cpu_available,
     is_torch_bf16_gpu_available,
     is_torch_neuroncore_available,
+    is_torch_npu_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
     logging,
@@ -1368,12 +1369,13 @@ class TrainingArguments:
             self.framework == "pt"
             and is_torch_available()
             and (self.device.type != "cuda")
+            and (self.device.type != "npu")
             and (get_xla_device_type(self.device) != "GPU")
             and (self.fp16 or self.fp16_full_eval)
         ):
             raise ValueError(
                 "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
-                " (`--fp16_full_eval`) can only be used on CUDA devices."
+                " (`--fp16_full_eval`) can only be used on CUDA or NPU devices."
             )
 
         if (
@@ -1769,6 +1771,10 @@ class TrainingArguments:
             elif self.use_cpu:
                 device = torch.device("cpu")
                 self._n_gpu = 0
+            elif is_torch_npu_available():
+                device = torch.device("npu:0")
+                torch.npu.set_device(device)
+                self._n_gpu = 1
             else:
                 # if n_gpu is > 1 we'll use nn.DataParallel.
                 # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 21430cd5ba..bca5440f8e 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -164,6 +164,7 @@ from .import_utils import (
     is_torch_fx_proxy,
     is_torch_mps_available,
     is_torch_neuroncore_available,
+    is_torch_npu_available,
     is_torch_tensorrt_fx_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 83b6f01dfd..ffefaed9e6 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -397,6 +397,25 @@ def is_torch_neuroncore_available(check_device=True):
     return False
 
 
+@lru_cache()
+def is_torch_npu_available(check_device=False):
+    "Checks if `torch_npu` is installed and potentially if a NPU is in the environment"
+    if not _torch_available or importlib.util.find_spec("torch_npu") is None:
+        return False
+
+    import torch
+    import torch_npu  # noqa: F401
+
+    if check_device:
+        try:
+            # Will raise a RuntimeError if no NPU is found
+            _ = torch.npu.device_count()
+            return torch.npu.is_available()
+        except RuntimeError:
+            return False
+    return hasattr(torch, "npu") and torch.npu.is_available()
+
+
 def is_torchdynamo_available():
     if not is_torch_available():
         return False
diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py
index 4078885a5d..8711a3970f 100644
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -21,6 +21,7 @@ from transformers.testing_utils import (
     get_torch_dist_unique_port,
     require_torch_multi_gpu,
     require_torch_neuroncore,
+    require_torch_npu,
 )
 from transformers.training_args import ParallelMode
 from transformers.utils import logging
@@ -77,6 +78,20 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
         # successful return here == success - any errors would have caused an error in the sub-call
 
 
+class TestTrainerDistributedNPU(TestCasePlus):
+    @require_torch_npu
+    def test_trainer(self):
+        distributed_args = f"""--nproc_per_node=2
+            --master_port={get_torch_dist_unique_port()}
+            {self.test_file_dir}/test_trainer_distributed.py
+        """.split()
+        output_dir = self.get_auto_remove_tmp_dir()
+        args = f"--output_dir {output_dir}".split()
+        cmd = ["torchrun"] + distributed_args + args
+        execute_subprocess_async(cmd, env=self.get_env())
+        # successful return here == success - any errors would have caused an error in the sub-call
+
+
 class TestTrainerDistributed(TestCasePlus):
     @require_torch_multi_gpu
     def test_trainer(self):