add ascend npu accelerator support (#24879)

* Add Ascend NPU accelerator support * fix style warning
2023-07-18 20:20:32 +08:00
parent f14c7f999d
commit 9c875839c0
7 changed files with 68 additions and 1 deletions
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -721,6 +721,7 @@ _import_structure = {
        "is_tokenizers_available",
        "is_torch_available",
        "is_torch_neuroncore_available",
        "is_torch_npu_available",
        "is_torch_tpu_available",
        "is_torchvision_available",
        "is_vision_available",
@@ -4643,6 +4644,7 @@ if TYPE_CHECKING:
        is_tokenizers_available,
        is_torch_available,
        is_torch_neuroncore_available,
        is_torch_npu_available,
        is_torch_tpu_available,
        is_torchvision_available,
        is_vision_available,
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -91,6 +91,7 @@ from .utils import (
    is_torch_bf16_cpu_available,
    is_torch_bf16_gpu_available,
    is_torch_neuroncore_available,
    is_torch_npu_available,
    is_torch_tensorrt_fx_available,
    is_torch_tf32_available,
    is_torch_tpu_available,
@@ -587,6 +588,26 @@ def require_torch_neuroncore(test_case):
    )
 def require_torch_npu(test_case):
    """
    Decorator marking a test that requires NPU (in PyTorch). The test is skipped with the reason "test requires
    PyTorch NPU" whenever `is_torch_npu_available()` returns a falsy value.
    """
    npu_present = is_torch_npu_available()
    skip_without_npu = unittest.skipUnless(npu_present, "test requires PyTorch NPU")
    return skip_without_npu(test_case)
 def require_torch_multi_npu(test_case):
    """
    Decorator marking a test that requires a multi-NPU setup (in PyTorch). The test is skipped when
    `is_torch_npu_available()` is falsy, or when `torch.npu.device_count()` reports one device or fewer.
    To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu"
    """
    if is_torch_npu_available():
        # Only query the device count once NPU support has been confirmed.
        has_several_npus = torch.npu.device_count() > 1
        return unittest.skipUnless(has_several_npus, "test requires multiple NPUs")(test_case)
    return unittest.skip("test requires PyTorch NPU")(test_case)
 if is_torch_available():
    # Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
    import torch
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -36,6 +36,7 @@ from .utils import (
    is_torch_available,
    is_torch_cuda_available,
    is_torch_mps_available,
    is_torch_npu_available,
    is_torch_tpu_available,
    requires_backends,
 )
@@ -94,6 +95,8 @@ def set_seed(seed: int):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_torch_npu_available():
        torch.npu.manual_seed_all(seed)
    if is_tf_available():
        tf.random.set_seed(seed)
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -47,6 +47,7 @@ from .utils import (
    is_torch_bf16_cpu_available,
    is_torch_bf16_gpu_available,
    is_torch_neuroncore_available,
    is_torch_npu_available,
    is_torch_tf32_available,
    is_torch_tpu_available,
    logging,
@@ -1368,12 +1369,13 @@ class TrainingArguments:
            self.framework == "pt"
            and is_torch_available()
            and (self.device.type != "cuda")
            and (self.device.type != "npu")
            and (get_xla_device_type(self.device) != "GPU")
            and (self.fp16 or self.fp16_full_eval)
        ):
            raise ValueError(
                "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
-                " (`--fp16_full_eval`) can only be used on CUDA devices."
+                " (`--fp16_full_eval`) can only be used on CUDA or NPU devices."
            )
        if (
@@ -1769,6 +1771,10 @@ class TrainingArguments:
            elif self.use_cpu:
                device = torch.device("cpu")
                self._n_gpu = 0
            elif is_torch_npu_available():
                device = torch.device("npu:0")
                torch.npu.set_device(device)
                self._n_gpu = 1
            else:
                # if n_gpu is > 1 we'll use nn.DataParallel.
                # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -164,6 +164,7 @@ from .import_utils import (
    is_torch_fx_proxy,
    is_torch_mps_available,
    is_torch_neuroncore_available,
    is_torch_npu_available,
    is_torch_tensorrt_fx_available,
    is_torch_tf32_available,
    is_torch_tpu_available,
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -397,6 +397,25 @@ def is_torch_neuroncore_available(check_device=True):
    return False
@lru_cache()
 def is_torch_npu_available(check_device=False):
    "Checks if `torch_npu` is installed and potentially if a NPU is in the environment"
    if not _torch_available or importlib.util.find_spec("torch_npu") is None:
        return False
    import torch
    import torch_npu  # noqa: F401
    if check_device:
        try:
            # Will raise a RuntimeError if no NPU is found
            _ = torch.npu.device_count()
            return torch.npu.is_available()
        except RuntimeError:
            return False
    return hasattr(torch, "npu") and torch.npu.is_available()
 def is_torchdynamo_available():
    if not is_torch_available():
        return False
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -21,6 +21,7 @@ from transformers.testing_utils import (
    get_torch_dist_unique_port,
    require_torch_multi_gpu,
    require_torch_neuroncore,
    require_torch_npu,
 )
 from transformers.training_args import ParallelMode
 from transformers.utils import logging
@@ -77,6 +78,20 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
        # successful return here == success - any errors would have caused an error in the sub-call
 class TestTrainerDistributedNPU(TestCasePlus):
    # Runs this test file as a script under `torchrun` with two worker processes.
    # NOTE(review): two processes are launched but the gate is `require_torch_npu` (single NPU) - confirm this is
    # intended rather than a multi-NPU requirement.
    @require_torch_npu
    def test_trainer(self):
        launch_spec = " ".join(
            (
                "--nproc_per_node=2",
                f"--master_port={get_torch_dist_unique_port()}",
                f"{self.test_file_dir}/test_trainer_distributed.py",
            )
        )
        # Whitespace-split the assembled strings to obtain the individual command-line tokens.
        launcher_args = launch_spec.split()
        tmp_output_dir = self.get_auto_remove_tmp_dir()
        script_args = f"--output_dir {tmp_output_dir}".split()
        full_command = ["torchrun", *launcher_args, *script_args]
        execute_subprocess_async(full_command, env=self.get_env())
        # successful return here == success - any errors would have caused an error in the sub-call
 class TestTrainerDistributed(TestCasePlus):
    @require_torch_multi_gpu
    def test_trainer(self):