Patch with accelerate xpu (#25714)

* patch with accelerate xpu
* patch with accelerate xpu
* formatting
* fix tests
* revert ruff unrelated fixes
* revert ruff unrelated fixes
* revert ruff unrelated fixes
* fix test
* review fixes
* review fixes
* black fixed
* review commits
* review commits
* style fix
* use pytorch_utils
* revert markuplm test
2023-09-05 20:11:42 +05:30
parent aa5c94d38d
commit 70a98024b1
7 changed files with 127 additions and 14 deletions
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -22,6 +22,7 @@ from transformers.testing_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_torch_multi_gpu,
+    require_torch_multi_xpu,
    require_torch_neuroncore,
    require_torch_npu,
 )
@@ -158,6 +159,20 @@ class TestTrainerDistributed(TestCasePlus):
        # successful return here == success - any errors would have caused an error in the sub-call


+@require_torch_multi_xpu
+class TestTrainerDistributedXPU(TestCasePlus):
+    def test_trainer(self):
+        distributed_args = f"""--nproc_per_node={torch.xpu.device_count()}
+            --master_port={get_torch_dist_unique_port()}
+            {self.test_file_dir}/test_trainer_distributed.py
+        """.split()
+        output_dir = self.get_auto_remove_tmp_dir()
+        args = f"--output_dir {output_dir}".split()
+        cmd = ["torchrun"] + distributed_args + args
+        execute_subprocess_async(cmd, env=self.get_env())
+        # successful return here == success - any errors would have caused an error in the sub-call
+
+
 if __name__ == "__main__":
    # The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
    #