add ascend npu accelerator support (#24879)

* Add Ascend NPU accelerator support

* fix style warning
This commit is contained in:
statelesshz
2023-07-18 20:20:32 +08:00
committed by GitHub
parent f14c7f999d
commit 9c875839c0
7 changed files with 68 additions and 1 deletions

View File

@@ -721,6 +721,7 @@ _import_structure = {
"is_tokenizers_available", "is_tokenizers_available",
"is_torch_available", "is_torch_available",
"is_torch_neuroncore_available", "is_torch_neuroncore_available",
"is_torch_npu_available",
"is_torch_tpu_available", "is_torch_tpu_available",
"is_torchvision_available", "is_torchvision_available",
"is_vision_available", "is_vision_available",
@@ -4643,6 +4644,7 @@ if TYPE_CHECKING:
is_tokenizers_available, is_tokenizers_available,
is_torch_available, is_torch_available,
is_torch_neuroncore_available, is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tpu_available, is_torch_tpu_available,
is_torchvision_available, is_torchvision_available,
is_vision_available, is_vision_available,

View File

@@ -91,6 +91,7 @@ from .utils import (
is_torch_bf16_cpu_available, is_torch_bf16_cpu_available,
is_torch_bf16_gpu_available, is_torch_bf16_gpu_available,
is_torch_neuroncore_available, is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tensorrt_fx_available, is_torch_tensorrt_fx_available,
is_torch_tf32_available, is_torch_tf32_available,
is_torch_tpu_available, is_torch_tpu_available,
@@ -587,6 +588,26 @@ def require_torch_neuroncore(test_case):
) )
def require_torch_npu(test_case):
    """
    Decorator for tests that need an NPU (in PyTorch). The test is skipped unless `is_torch_npu_available()` returns a
    truthy value.
    """
    skip_without_npu = unittest.skipUnless(is_torch_npu_available(), "test requires PyTorch NPU")
    return skip_without_npu(test_case)
def require_torch_multi_npu(test_case):
    """
    Decorator for tests that need a multi-NPU setup (in PyTorch). The test is skipped when
    `is_torch_npu_available()` is falsy, or when `torch.npu.device_count()` does not report more than one device.

    To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu"
    """
    if is_torch_npu_available():
        # The device count is only queried once the NPU backend is known to be usable.
        has_several_npus = torch.npu.device_count() > 1
        return unittest.skipUnless(has_several_npus, "test requires multiple NPUs")(test_case)

    return unittest.skip("test requires PyTorch NPU")(test_case)
if is_torch_available(): if is_torch_available():
# Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode # Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
import torch import torch

View File

@@ -36,6 +36,7 @@ from .utils import (
is_torch_available, is_torch_available,
is_torch_cuda_available, is_torch_cuda_available,
is_torch_mps_available, is_torch_mps_available,
is_torch_npu_available,
is_torch_tpu_available, is_torch_tpu_available,
requires_backends, requires_backends,
) )
@@ -94,6 +95,8 @@ def set_seed(seed: int):
torch.manual_seed(seed) torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed) torch.cuda.manual_seed_all(seed)
# ^^ safe to call this function even if cuda is not available # ^^ safe to call this function even if cuda is not available
if is_torch_npu_available():
torch.npu.manual_seed_all(seed)
if is_tf_available(): if is_tf_available():
tf.random.set_seed(seed) tf.random.set_seed(seed)

View File

@@ -47,6 +47,7 @@ from .utils import (
is_torch_bf16_cpu_available, is_torch_bf16_cpu_available,
is_torch_bf16_gpu_available, is_torch_bf16_gpu_available,
is_torch_neuroncore_available, is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tf32_available, is_torch_tf32_available,
is_torch_tpu_available, is_torch_tpu_available,
logging, logging,
@@ -1368,12 +1369,13 @@ class TrainingArguments:
self.framework == "pt" self.framework == "pt"
and is_torch_available() and is_torch_available()
and (self.device.type != "cuda") and (self.device.type != "cuda")
and (self.device.type != "npu")
and (get_xla_device_type(self.device) != "GPU") and (get_xla_device_type(self.device) != "GPU")
and (self.fp16 or self.fp16_full_eval) and (self.fp16 or self.fp16_full_eval)
): ):
raise ValueError( raise ValueError(
"FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation" "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
" (`--fp16_full_eval`) can only be used on CUDA devices." " (`--fp16_full_eval`) can only be used on CUDA or NPU devices."
) )
if ( if (
@@ -1769,6 +1771,10 @@ class TrainingArguments:
elif self.use_cpu: elif self.use_cpu:
device = torch.device("cpu") device = torch.device("cpu")
self._n_gpu = 0 self._n_gpu = 0
elif is_torch_npu_available():
device = torch.device("npu:0")
torch.npu.set_device(device)
self._n_gpu = 1
else: else:
# if n_gpu is > 1 we'll use nn.DataParallel. # if n_gpu is > 1 we'll use nn.DataParallel.
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`

View File

@@ -164,6 +164,7 @@ from .import_utils import (
is_torch_fx_proxy, is_torch_fx_proxy,
is_torch_mps_available, is_torch_mps_available,
is_torch_neuroncore_available, is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tensorrt_fx_available, is_torch_tensorrt_fx_available,
is_torch_tf32_available, is_torch_tf32_available,
is_torch_tpu_available, is_torch_tpu_available,

View File

@@ -397,6 +397,25 @@ def is_torch_neuroncore_available(check_device=True):
return False return False
@lru_cache()
def is_torch_npu_available(check_device=False):
    """
    Checks if `torch_npu` is installed and potentially if a NPU is in the environment.

    Returns `False` right away when PyTorch is unavailable or the `torch_npu` package cannot be found. Otherwise the
    answer comes from `torch.npu.is_available()`; with `check_device=True` the device count is queried first and a
    `RuntimeError` from that probe is reported as `False`. Results are memoized per `check_device` value by
    `lru_cache`.
    """
    if not _torch_available:
        return False
    if importlib.util.find_spec("torch_npu") is None:
        return False

    import torch
    import torch_npu  # noqa: F401  (the imported name itself is not used below)

    if not check_device:
        return hasattr(torch, "npu") and torch.npu.is_available()

    try:
        # Will raise a RuntimeError if no NPU is found
        torch.npu.device_count()
        npu_ready = torch.npu.is_available()
    except RuntimeError:
        npu_ready = False
    return npu_ready
def is_torchdynamo_available(): def is_torchdynamo_available():
if not is_torch_available(): if not is_torch_available():
return False return False

View File

@@ -21,6 +21,7 @@ from transformers.testing_utils import (
get_torch_dist_unique_port, get_torch_dist_unique_port,
require_torch_multi_gpu, require_torch_multi_gpu,
require_torch_neuroncore, require_torch_neuroncore,
require_torch_npu,
) )
from transformers.training_args import ParallelMode from transformers.training_args import ParallelMode
from transformers.utils import logging from transformers.utils import logging
@@ -77,6 +78,20 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
# successful return here == success - any errors would have caused an error in the sub-call # successful return here == success - any errors would have caused an error in the sub-call
# Runs `test_trainer_distributed.py` through `torchrun` with two processes; gated by `require_torch_npu`.
class TestTrainerDistributedNPU(TestCasePlus):
    @require_torch_npu
    def test_trainer(self):
        # Launcher options followed by the script path, tokenized on whitespace.
        launcher_args = f"""--nproc_per_node=2
            --master_port={get_torch_dist_unique_port()}
            {self.test_file_dir}/test_trainer_distributed.py
        """.split()
        tmp_output_dir = self.get_auto_remove_tmp_dir()
        script_args = f"--output_dir {tmp_output_dir}".split()
        execute_subprocess_async(["torchrun", *launcher_args, *script_args], env=self.get_env())
        # successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributed(TestCasePlus): class TestTrainerDistributed(TestCasePlus):
@require_torch_multi_gpu @require_torch_multi_gpu
def test_trainer(self): def test_trainer(self):