add ascend npu accelerator support (#24879)
* Add Ascend NPU accelerator support * fix style warning
This commit is contained in:
@@ -721,6 +721,7 @@ _import_structure = {
|
||||
"is_tokenizers_available",
|
||||
"is_torch_available",
|
||||
"is_torch_neuroncore_available",
|
||||
"is_torch_npu_available",
|
||||
"is_torch_tpu_available",
|
||||
"is_torchvision_available",
|
||||
"is_vision_available",
|
||||
@@ -4643,6 +4644,7 @@ if TYPE_CHECKING:
|
||||
is_tokenizers_available,
|
||||
is_torch_available,
|
||||
is_torch_neuroncore_available,
|
||||
is_torch_npu_available,
|
||||
is_torch_tpu_available,
|
||||
is_torchvision_available,
|
||||
is_vision_available,
|
||||
|
||||
@@ -91,6 +91,7 @@ from .utils import (
|
||||
is_torch_bf16_cpu_available,
|
||||
is_torch_bf16_gpu_available,
|
||||
is_torch_neuroncore_available,
|
||||
is_torch_npu_available,
|
||||
is_torch_tensorrt_fx_available,
|
||||
is_torch_tf32_available,
|
||||
is_torch_tpu_available,
|
||||
@@ -587,6 +588,26 @@ def require_torch_neuroncore(test_case):
|
||||
)
|
||||
|
||||
|
||||
def require_torch_npu(test_case):
    """
    Decorator marking a test that requires NPU (in PyTorch).

    The test is skipped with the reason "test requires PyTorch NPU" whenever `is_torch_npu_available()` is falsy.
    """
    # Build the skip decorator first, then apply it to the test.
    skip_without_npu = unittest.skipUnless(is_torch_npu_available(), "test requires PyTorch NPU")
    return skip_without_npu(test_case)
|
||||
|
||||
|
||||
def require_torch_multi_npu(test_case):
    """
    Decorator marking a test that requires a multi-NPU setup (in PyTorch). These tests are skipped on a machine without
    multiple NPUs.

    To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu"
    """
    if is_torch_npu_available():
        # Only query the device count once NPU support is known to be usable.
        has_several_npus = torch.npu.device_count() > 1
        return unittest.skipUnless(has_several_npus, "test requires multiple NPUs")(test_case)

    # No NPU backend at all: skip unconditionally.
    return unittest.skip("test requires PyTorch NPU")(test_case)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
# Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
|
||||
import torch
|
||||
|
||||
@@ -36,6 +36,7 @@ from .utils import (
|
||||
is_torch_available,
|
||||
is_torch_cuda_available,
|
||||
is_torch_mps_available,
|
||||
is_torch_npu_available,
|
||||
is_torch_tpu_available,
|
||||
requires_backends,
|
||||
)
|
||||
@@ -94,6 +95,8 @@ def set_seed(seed: int):
|
||||
torch.manual_seed(seed)
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
# ^^ safe to call this function even if cuda is not available
|
||||
if is_torch_npu_available():
|
||||
torch.npu.manual_seed_all(seed)
|
||||
if is_tf_available():
|
||||
tf.random.set_seed(seed)
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@ from .utils import (
|
||||
is_torch_bf16_cpu_available,
|
||||
is_torch_bf16_gpu_available,
|
||||
is_torch_neuroncore_available,
|
||||
is_torch_npu_available,
|
||||
is_torch_tf32_available,
|
||||
is_torch_tpu_available,
|
||||
logging,
|
||||
@@ -1368,12 +1369,13 @@ class TrainingArguments:
|
||||
self.framework == "pt"
|
||||
and is_torch_available()
|
||||
and (self.device.type != "cuda")
|
||||
and (self.device.type != "npu")
|
||||
and (get_xla_device_type(self.device) != "GPU")
|
||||
and (self.fp16 or self.fp16_full_eval)
|
||||
):
|
||||
raise ValueError(
|
||||
"FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
|
||||
" (`--fp16_full_eval`) can only be used on CUDA devices."
|
||||
" (`--fp16_full_eval`) can only be used on CUDA or NPU devices."
|
||||
)
|
||||
|
||||
if (
|
||||
@@ -1769,6 +1771,10 @@ class TrainingArguments:
|
||||
elif self.use_cpu:
|
||||
device = torch.device("cpu")
|
||||
self._n_gpu = 0
|
||||
elif is_torch_npu_available():
|
||||
device = torch.device("npu:0")
|
||||
torch.npu.set_device(device)
|
||||
self._n_gpu = 1
|
||||
else:
|
||||
# if n_gpu is > 1 we'll use nn.DataParallel.
|
||||
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
|
||||
|
||||
@@ -164,6 +164,7 @@ from .import_utils import (
|
||||
is_torch_fx_proxy,
|
||||
is_torch_mps_available,
|
||||
is_torch_neuroncore_available,
|
||||
is_torch_npu_available,
|
||||
is_torch_tensorrt_fx_available,
|
||||
is_torch_tf32_available,
|
||||
is_torch_tpu_available,
|
||||
|
||||
@@ -397,6 +397,25 @@ def is_torch_neuroncore_available(check_device=True):
|
||||
return False
|
||||
|
||||
|
||||
@lru_cache()
def is_torch_npu_available(check_device=False):
    """
    Checks if `torch_npu` is installed and potentially if a NPU is in the environment.

    Args:
        check_device: when truthy, `torch.npu.device_count()` is called first and a `RuntimeError` raised by the
            probe is turned into `False`.

    Returns:
        `False` when torch or `torch_npu` is not installed, otherwise the result of the `torch.npu` availability
        check. The result is memoized per `check_device` value by `lru_cache`.
    """
    # Cheap guards first: neither torch nor torch_npu is imported unless both are installed.
    if not _torch_available:
        return False
    if importlib.util.find_spec("torch_npu") is None:
        return False

    import torch
    import torch_npu  # noqa: F401

    if not check_device:
        return hasattr(torch, "npu") and torch.npu.is_available()

    try:
        # Will raise a RuntimeError if no NPU is found
        torch.npu.device_count()
        return torch.npu.is_available()
    except RuntimeError:
        return False
|
||||
|
||||
|
||||
def is_torchdynamo_available():
|
||||
if not is_torch_available():
|
||||
return False
|
||||
|
||||
@@ -21,6 +21,7 @@ from transformers.testing_utils import (
|
||||
get_torch_dist_unique_port,
|
||||
require_torch_multi_gpu,
|
||||
require_torch_neuroncore,
|
||||
require_torch_npu,
|
||||
)
|
||||
from transformers.training_args import ParallelMode
|
||||
from transformers.utils import logging
|
||||
@@ -77,6 +78,20 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
|
||||
# successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
|
||||
class TestTrainerDistributedNPU(TestCasePlus):
    """Runs the distributed trainer script under `torchrun` with two processes; needs PyTorch NPU."""

    @require_torch_npu
    def test_trainer(self):
        # torchrun launcher options followed by the script to run; whitespace-split into argv tokens.
        launcher_tokens = f"""--nproc_per_node=2
            --master_port={get_torch_dist_unique_port()}
            {self.test_file_dir}/test_trainer_distributed.py
        """.split()
        tmp_output_dir = self.get_auto_remove_tmp_dir()
        script_tokens = f"--output_dir {tmp_output_dir}".split()
        full_cmd = ["torchrun", *launcher_tokens, *script_tokens]
        execute_subprocess_async(full_cmd, env=self.get_env())
        # successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
|
||||
class TestTrainerDistributed(TestCasePlus):
|
||||
@require_torch_multi_gpu
|
||||
def test_trainer(self):
|
||||
|
||||
Reference in New Issue
Block a user