add ascend npu accelerator support (#24879)
* Add Ascend NPU accelerator support * fix style warning
This commit is contained in:
@@ -721,6 +721,7 @@ _import_structure = {
|
|||||||
"is_tokenizers_available",
|
"is_tokenizers_available",
|
||||||
"is_torch_available",
|
"is_torch_available",
|
||||||
"is_torch_neuroncore_available",
|
"is_torch_neuroncore_available",
|
||||||
|
"is_torch_npu_available",
|
||||||
"is_torch_tpu_available",
|
"is_torch_tpu_available",
|
||||||
"is_torchvision_available",
|
"is_torchvision_available",
|
||||||
"is_vision_available",
|
"is_vision_available",
|
||||||
@@ -4643,6 +4644,7 @@ if TYPE_CHECKING:
|
|||||||
is_tokenizers_available,
|
is_tokenizers_available,
|
||||||
is_torch_available,
|
is_torch_available,
|
||||||
is_torch_neuroncore_available,
|
is_torch_neuroncore_available,
|
||||||
|
is_torch_npu_available,
|
||||||
is_torch_tpu_available,
|
is_torch_tpu_available,
|
||||||
is_torchvision_available,
|
is_torchvision_available,
|
||||||
is_vision_available,
|
is_vision_available,
|
||||||
|
|||||||
@@ -91,6 +91,7 @@ from .utils import (
|
|||||||
is_torch_bf16_cpu_available,
|
is_torch_bf16_cpu_available,
|
||||||
is_torch_bf16_gpu_available,
|
is_torch_bf16_gpu_available,
|
||||||
is_torch_neuroncore_available,
|
is_torch_neuroncore_available,
|
||||||
|
is_torch_npu_available,
|
||||||
is_torch_tensorrt_fx_available,
|
is_torch_tensorrt_fx_available,
|
||||||
is_torch_tf32_available,
|
is_torch_tf32_available,
|
||||||
is_torch_tpu_available,
|
is_torch_tpu_available,
|
||||||
@@ -587,6 +588,26 @@ def require_torch_neuroncore(test_case):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def require_torch_npu(test_case):
    """
    Decorator marking a test that requires NPU (in PyTorch).

    The test is skipped with the reason "test requires PyTorch NPU" whenever `is_torch_npu_available()` returns a
    falsy value.
    """
    npu_present = is_torch_npu_available()
    skip_without_npu = unittest.skipUnless(npu_present, "test requires PyTorch NPU")
    return skip_without_npu(test_case)
|
||||||
|
|
||||||
|
|
||||||
|
def require_torch_multi_npu(test_case):
    """
    Decorator marking a test that requires a multi-NPU setup (in PyTorch). These tests are skipped on a machine without
    multiple NPUs.

    To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu"
    """
    if is_torch_npu_available():
        # Only query the device count once NPU support has been confirmed.
        has_several_npus = torch.npu.device_count() > 1
        return unittest.skipUnless(has_several_npus, "test requires multiple NPUs")(test_case)

    # No NPU support at all: skip unconditionally.
    return unittest.skip("test requires PyTorch NPU")(test_case)
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
# Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
|
# Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
|
||||||
import torch
|
import torch
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ from .utils import (
|
|||||||
is_torch_available,
|
is_torch_available,
|
||||||
is_torch_cuda_available,
|
is_torch_cuda_available,
|
||||||
is_torch_mps_available,
|
is_torch_mps_available,
|
||||||
|
is_torch_npu_available,
|
||||||
is_torch_tpu_available,
|
is_torch_tpu_available,
|
||||||
requires_backends,
|
requires_backends,
|
||||||
)
|
)
|
||||||
@@ -94,6 +95,8 @@ def set_seed(seed: int):
|
|||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
torch.cuda.manual_seed_all(seed)
|
torch.cuda.manual_seed_all(seed)
|
||||||
# ^^ safe to call this function even if cuda is not available
|
# ^^ safe to call this function even if cuda is not available
|
||||||
|
if is_torch_npu_available():
|
||||||
|
torch.npu.manual_seed_all(seed)
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
tf.random.set_seed(seed)
|
tf.random.set_seed(seed)
|
||||||
|
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ from .utils import (
|
|||||||
is_torch_bf16_cpu_available,
|
is_torch_bf16_cpu_available,
|
||||||
is_torch_bf16_gpu_available,
|
is_torch_bf16_gpu_available,
|
||||||
is_torch_neuroncore_available,
|
is_torch_neuroncore_available,
|
||||||
|
is_torch_npu_available,
|
||||||
is_torch_tf32_available,
|
is_torch_tf32_available,
|
||||||
is_torch_tpu_available,
|
is_torch_tpu_available,
|
||||||
logging,
|
logging,
|
||||||
@@ -1368,12 +1369,13 @@ class TrainingArguments:
|
|||||||
self.framework == "pt"
|
self.framework == "pt"
|
||||||
and is_torch_available()
|
and is_torch_available()
|
||||||
and (self.device.type != "cuda")
|
and (self.device.type != "cuda")
|
||||||
|
and (self.device.type != "npu")
|
||||||
and (get_xla_device_type(self.device) != "GPU")
|
and (get_xla_device_type(self.device) != "GPU")
|
||||||
and (self.fp16 or self.fp16_full_eval)
|
and (self.fp16 or self.fp16_full_eval)
|
||||||
):
|
):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
|
"FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
|
||||||
" (`--fp16_full_eval`) can only be used on CUDA devices."
|
" (`--fp16_full_eval`) can only be used on CUDA or NPU devices."
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -1769,6 +1771,10 @@ class TrainingArguments:
|
|||||||
elif self.use_cpu:
|
elif self.use_cpu:
|
||||||
device = torch.device("cpu")
|
device = torch.device("cpu")
|
||||||
self._n_gpu = 0
|
self._n_gpu = 0
|
||||||
|
elif is_torch_npu_available():
|
||||||
|
device = torch.device("npu:0")
|
||||||
|
torch.npu.set_device(device)
|
||||||
|
self._n_gpu = 1
|
||||||
else:
|
else:
|
||||||
# if n_gpu is > 1 we'll use nn.DataParallel.
|
# if n_gpu is > 1 we'll use nn.DataParallel.
|
||||||
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
|
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
|
||||||
|
|||||||
@@ -164,6 +164,7 @@ from .import_utils import (
|
|||||||
is_torch_fx_proxy,
|
is_torch_fx_proxy,
|
||||||
is_torch_mps_available,
|
is_torch_mps_available,
|
||||||
is_torch_neuroncore_available,
|
is_torch_neuroncore_available,
|
||||||
|
is_torch_npu_available,
|
||||||
is_torch_tensorrt_fx_available,
|
is_torch_tensorrt_fx_available,
|
||||||
is_torch_tf32_available,
|
is_torch_tf32_available,
|
||||||
is_torch_tpu_available,
|
is_torch_tpu_available,
|
||||||
|
|||||||
@@ -397,6 +397,25 @@ def is_torch_neuroncore_available(check_device=True):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache()
def is_torch_npu_available(check_device=False):
    """
    Checks if `torch_npu` is installed and potentially if a NPU is in the environment.

    Returns `False` right away when torch is flagged as unavailable or when no `torch_npu` module spec can be found.
    Otherwise `torch` and `torch_npu` are imported and:

    - with `check_device=True`, `torch.npu.device_count()` is called first and the result of
      `torch.npu.is_available()` is returned; a `RuntimeError` from either call yields `False`;
    - with `check_device=False` (default), the result is whether `torch` exposes an `npu` attribute and
      `torch.npu.is_available()` is truthy.

    Results are memoized per argument value by `lru_cache`.
    """
    if not (_torch_available and importlib.util.find_spec("torch_npu") is not None):
        return False

    import torch
    import torch_npu  # noqa: F401

    if not check_device:
        return hasattr(torch, "npu") and torch.npu.is_available()

    try:
        # Will raise a RuntimeError if no NPU is found
        torch.npu.device_count()
        npu_found = torch.npu.is_available()
    except RuntimeError:
        npu_found = False
    return npu_found
|
||||||
|
|
||||||
|
|
||||||
def is_torchdynamo_available():
|
def is_torchdynamo_available():
|
||||||
if not is_torch_available():
|
if not is_torch_available():
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from transformers.testing_utils import (
|
|||||||
get_torch_dist_unique_port,
|
get_torch_dist_unique_port,
|
||||||
require_torch_multi_gpu,
|
require_torch_multi_gpu,
|
||||||
require_torch_neuroncore,
|
require_torch_neuroncore,
|
||||||
|
require_torch_npu,
|
||||||
)
|
)
|
||||||
from transformers.training_args import ParallelMode
|
from transformers.training_args import ParallelMode
|
||||||
from transformers.utils import logging
|
from transformers.utils import logging
|
||||||
@@ -77,6 +78,20 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
|
|||||||
# successful return here == success - any errors would have caused an error in the sub-call
|
# successful return here == success - any errors would have caused an error in the sub-call
|
||||||
|
|
||||||
|
|
||||||
|
class TestTrainerDistributedNPU(TestCasePlus):
    """Launches the distributed trainer script through `torchrun` on a machine with NPU support."""

    @require_torch_npu
    def test_trainer(self):
        # Launcher options followed by the script to run, tokenized on whitespace.
        launcher_tokens = f"""--nproc_per_node=2
            --master_port={get_torch_dist_unique_port()}
            {self.test_file_dir}/test_trainer_distributed.py
        """.split()
        tmp_output_dir = self.get_auto_remove_tmp_dir()
        script_tokens = f"--output_dir {tmp_output_dir}".split()
        full_command = ["torchrun", *launcher_tokens, *script_tokens]
        execute_subprocess_async(full_command, env=self.get_env())
        # successful return here == success - any errors would have caused an error in the sub-call
|
||||||
|
|
||||||
|
|
||||||
class TestTrainerDistributed(TestCasePlus):
|
class TestTrainerDistributed(TestCasePlus):
|
||||||
@require_torch_multi_gpu
|
@require_torch_multi_gpu
|
||||||
def test_trainer(self):
|
def test_trainer(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user