From 9c875839c0800c55c23f651ec4a1821d0c20ca04 Mon Sep 17 00:00:00 2001 From: statelesshz <3140102143@zju.edu.cn> Date: Tue, 18 Jul 2023 20:20:32 +0800 Subject: [PATCH] add ascend npu accelerator support (#24879) * Add Ascend NPU accelerator support * fix style warning --- src/transformers/__init__.py | 2 ++ src/transformers/testing_utils.py | 21 +++++++++++++++++++++ src/transformers/trainer_utils.py | 3 +++ src/transformers/training_args.py | 8 +++++++- src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 19 +++++++++++++++++++ tests/trainer/test_trainer_distributed.py | 15 +++++++++++++++ 7 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 80eb14588f..409c081225 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -721,6 +721,7 @@ _import_structure = { "is_tokenizers_available", "is_torch_available", "is_torch_neuroncore_available", + "is_torch_npu_available", "is_torch_tpu_available", "is_torchvision_available", "is_vision_available", @@ -4643,6 +4644,7 @@ if TYPE_CHECKING: is_tokenizers_available, is_torch_available, is_torch_neuroncore_available, + is_torch_npu_available, is_torch_tpu_available, is_torchvision_available, is_vision_available, diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 698327f658..a9ab304d2a 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -91,6 +91,7 @@ from .utils import ( is_torch_bf16_cpu_available, is_torch_bf16_gpu_available, is_torch_neuroncore_available, + is_torch_npu_available, is_torch_tensorrt_fx_available, is_torch_tf32_available, is_torch_tpu_available, @@ -587,6 +588,26 @@ def require_torch_neuroncore(test_case): ) +def require_torch_npu(test_case): + """ + Decorator marking a test that requires NPU (in PyTorch). 
+ """ + return unittest.skipUnless(is_torch_npu_available(), "test requires PyTorch NPU")(test_case) + + +def require_torch_multi_npu(test_case): + """ + Decorator marking a test that requires a multi-NPU setup (in PyTorch). These tests are skipped on a machine without + multiple NPUs. + + To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu" + """ + if not is_torch_npu_available(): + return unittest.skip("test requires PyTorch NPU")(test_case) + + return unittest.skipUnless(torch.npu.device_count() > 1, "test requires multiple NPUs")(test_case) + + if is_torch_available(): # Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode import torch diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 74f01ad927..30571597c2 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -36,6 +36,7 @@ from .utils import ( is_torch_available, is_torch_cuda_available, is_torch_mps_available, + is_torch_npu_available, is_torch_tpu_available, requires_backends, ) @@ -94,6 +95,8 @@ def set_seed(seed: int): torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # ^^ safe to call this function even if cuda is not available + if is_torch_npu_available(): + torch.npu.manual_seed_all(seed) if is_tf_available(): tf.random.set_seed(seed) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 74f6d929e0..3a30adbe0b 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -47,6 +47,7 @@ from .utils import ( is_torch_bf16_cpu_available, is_torch_bf16_gpu_available, is_torch_neuroncore_available, + is_torch_npu_available, is_torch_tf32_available, is_torch_tpu_available, logging, @@ -1368,12 +1369,13 @@ class TrainingArguments: self.framework == "pt" and is_torch_available() and (self.device.type != "cuda") + and (self.device.type != "npu") and (get_xla_device_type(self.device) != "GPU") and (self.fp16 
or self.fp16_full_eval) ): raise ValueError( "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation" - " (`--fp16_full_eval`) can only be used on CUDA devices." + " (`--fp16_full_eval`) can only be used on CUDA or NPU devices." ) if ( @@ -1769,6 +1771,10 @@ class TrainingArguments: elif self.use_cpu: device = torch.device("cpu") self._n_gpu = 0 + elif is_torch_npu_available(): + device = torch.device("npu:0") + torch.npu.set_device(device) + self._n_gpu = 1 else: # if n_gpu is > 1 we'll use nn.DataParallel. # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 21430cd5ba..bca5440f8e 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -164,6 +164,7 @@ from .import_utils import ( is_torch_fx_proxy, is_torch_mps_available, is_torch_neuroncore_available, + is_torch_npu_available, is_torch_tensorrt_fx_available, is_torch_tf32_available, is_torch_tpu_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 83b6f01dfd..ffefaed9e6 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -397,6 +397,25 @@ def is_torch_neuroncore_available(check_device=True): return False +@lru_cache() +def is_torch_npu_available(check_device=False): + "Checks if `torch_npu` is installed and potentially if a NPU is in the environment" + if not _torch_available or importlib.util.find_spec("torch_npu") is None: + return False + + import torch + import torch_npu # noqa: F401 + + if check_device: + try: + # Will raise a RuntimeError if no NPU is found + _ = torch.npu.device_count() + return torch.npu.is_available() + except RuntimeError: + return False + return hasattr(torch, "npu") and torch.npu.is_available() + + def is_torchdynamo_available(): if not is_torch_available(): return False diff --git 
a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py index 4078885a5d..8711a3970f 100644 --- a/tests/trainer/test_trainer_distributed.py +++ b/tests/trainer/test_trainer_distributed.py @@ -21,6 +21,7 @@ from transformers.testing_utils import ( get_torch_dist_unique_port, require_torch_multi_gpu, require_torch_neuroncore, + require_torch_npu, ) from transformers.training_args import ParallelMode from transformers.utils import logging @@ -77,6 +78,20 @@ class TestTrainerDistributedNeuronCore(TestCasePlus): # successful return here == success - any errors would have caused an error in the sub-call +class TestTrainerDistributedNPU(TestCasePlus): + @require_torch_npu + def test_trainer(self): + distributed_args = f"""--nproc_per_node=2 + --master_port={get_torch_dist_unique_port()} + {self.test_file_dir}/test_trainer_distributed.py + """.split() + output_dir = self.get_auto_remove_tmp_dir() + args = f"--output_dir {output_dir}".split() + cmd = ["torchrun"] + distributed_args + args + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + + class TestTrainerDistributed(TestCasePlus): @require_torch_multi_gpu def test_trainer(self):