HPU support (#36424)

* test

* fix

* fix

* skip some and run some first

* test fsdp

* fix

* patches for generate

* test distributed

* copy

* don't test distributed loss for hpu

* require fp16 and run first

* changes from marc's PR fixing zero3

* better alternative

* return True when fp16 support on gaudi without creating bridge

* fix

* fix tested dtype in deepspeed inference test

* test

* fix

* test

* fix

* skip

* require fp16

* run first fsdp

* Apply suggestions from code review

* address comments

* address comments and refactor test

* reduce precison

* avoid doing gaudi1 specific stuff in the genreation loop

* document test_gradient_accumulation_loss_alignment_with_model_loss test a bit more
This commit is contained in:
Ilyas Moutawwakil
2025-03-12 09:08:12 +01:00
committed by GitHub
parent 50d3530aa0
commit 89f6956015
19 changed files with 337 additions and 139 deletions

View File

@@ -19,12 +19,11 @@ import numpy as np
from transformers import EvalPrediction, HfArgumentParser, TrainingArguments, is_torch_available
from transformers.testing_utils import (
TestCasePlus,
backend_device_count,
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_gpu,
require_torch_multi_xpu,
require_torch_neuroncore,
require_torch_npu,
require_torch_multi_accelerator,
torch_device,
)
from transformers.training_args import ParallelMode
from transformers.utils import logging
@@ -117,38 +116,10 @@ if is_torch_available():
return result
class TestTrainerDistributedNeuronCore(TestCasePlus):
@require_torch_neuroncore
def test_trainer(self):
distributed_args = f"""--nproc_per_node=2
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = ["torchrun"] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributedNPU(TestCasePlus):
@require_torch_npu
def test_trainer(self):
distributed_args = f"""--nproc_per_node=2
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = ["torchrun"] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributed(TestCasePlus):
@require_torch_multi_gpu
@require_torch_multi_accelerator
def test_trainer(self):
distributed_args = f"""--nproc_per_node={torch.cuda.device_count()}
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
@@ -159,20 +130,6 @@ class TestTrainerDistributed(TestCasePlus):
# successful return here == success - any errors would have caused an error in the sub-call
@require_torch_multi_xpu
class TestTrainerDistributedXPU(TestCasePlus):
def test_trainer(self):
distributed_args = f"""--nproc_per_node={torch.xpu.device_count()}
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = ["torchrun"] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
if __name__ == "__main__":
# The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
#