HPU support (#36424)

* test

* fix

* fix

* skip some and run some first

* test fsdp

* fix

* patches for generate

* test distributed

* copy

* don't test distributed loss for hpu

* require fp16 and run first

* changes from marc's PR fixing zero3

* better alternative

* return True when fp16 support on gaudi without creating bridge

* fix

* fix tested dtype in deepspeed inference test

* test

* fix

* test

* fix

* skip

* require fp16

* run first fsdp

* Apply suggestions from code review

* address comments

* address comments and refactor test

* reduce precision

* avoid doing gaudi1 specific stuff in the generation loop

* document test_gradient_accumulation_loss_alignment_with_model_loss test a bit more
This commit is contained in:
Ilyas Moutawwakil
2025-03-12 09:08:12 +01:00
committed by GitHub
parent 50d3530aa0
commit 89f6956015
19 changed files with 337 additions and 139 deletions

View File

@@ -75,6 +75,7 @@ from transformers.testing_utils import (
require_intel_extension_for_pytorch,
require_liger_kernel,
require_lomo,
require_non_hpu,
require_non_xpu,
require_optuna,
require_peft,
@@ -88,6 +89,7 @@ from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torch_bf16,
require_torch_fp16,
require_torch_gpu,
require_torch_multi_accelerator,
require_torch_non_multi_accelerator,
@@ -98,6 +100,7 @@ from transformers.testing_utils import (
require_torchdynamo,
require_vision,
require_wandb,
run_first,
run_test_using_subprocess,
slow,
torch_device,
@@ -119,6 +122,13 @@ from transformers.utils import (
from transformers.utils.hp_naming import TrialShortNamer
if torch_device == "hpu":
RTOL = 1e-3
ATOL = 1e-3
else:
RTOL = 1e-5
ATOL = 1e-5
if is_torch_available():
import torch
from torch import nn
@@ -726,11 +736,11 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
trainer.train()
self.alternate_trained_model = (trainer.model.a, trainer.model.b)
def check_trained_model(self, model, alternate_seed=False):
def check_trained_model(self, model, alternate_seed=False, **kwargs):
# Checks a training seeded with learning_rate = 0.1
(a, b) = self.alternate_trained_model if alternate_seed else self.default_trained_model
torch.testing.assert_close(model.a, a)
torch.testing.assert_close(model.b, b)
torch.testing.assert_close(model.a, a, **kwargs)
torch.testing.assert_close(model.b, b, **kwargs)
def test_reproducible_training(self):
# Checks that training worked, model trained and seed made a reproducible training.
@@ -812,11 +822,6 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
model = AutoModelForCausalLM.from_pretrained(model_name)
state_dict = model.state_dict()
base_loss_callback = StoreLossCallback()
args_kwargs = {
"report_to": "none",
"logging_steps": 1,
@@ -830,6 +835,10 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
tmp_dir,
**args_kwargs,
)
# train with base loss
set_seed(42)
model = AutoModelForCausalLM.from_pretrained(model_name)
base_loss_callback = StoreLossCallback()
trainer = Trainer(
model,
args,
@@ -840,16 +849,17 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
assert trainer.model_accepts_loss_kwargs
trainer.train()
grad_accum_loss_callback = StoreLossCallback()
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir,
**args_kwargs,
gradient_accumulation_steps=2,
per_device_train_batch_size=4,
)
# train with gradient accumulation
set_seed(42)
model = AutoModelForCausalLM.from_pretrained(model_name)
grad_accum_loss_callback = StoreLossCallback()
trainer = Trainer(
model,
args,
@@ -857,10 +867,12 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
callbacks=[grad_accum_loss_callback],
data_collator=data_collator,
)
assert trainer.model_accepts_loss_kwargs
trainer.train()
# train with broken loss
set_seed(42)
model.load_state_dict(state_dict)
model = AutoModelForCausalLM.from_pretrained(model_name)
broken_loss_callback = StoreLossCallback()
trainer = Trainer(
model,
@@ -869,30 +881,28 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
callbacks=[broken_loss_callback],
data_collator=data_collator,
)
# disable model_accepts_loss_kwargs
# disable model_accepts_loss_kwargs so that "num_items_in_batch" is not passed to the model
trainer.model_accepts_loss_kwargs = False
trainer.train()
# Calculate the difference between the base loss and the grad_accum loss
diff_truth = [
abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)
]
diff_broken = [
abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)
]
# Calculate the difference between the base loss and the grad_accum loss
diff_truth = [
abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)
]
diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)]
# all diff truth should be quite close
self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01")
# all diff truth should be quite close
self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01")
# max diff broken should be very off
self.assertGreater(max(diff_broken), 1.5, f"Difference {max(diff_broken)} is not greater than 2")
# max diff broken should be very off
self.assertGreater(max(diff_broken), 1.3, f"Difference {max(diff_broken)} is not greater than 1.3")
loss_base = sum(base_loss_callback.losses)
loss_broken = sum(broken_loss_callback.losses)
loss_base = sum(base_loss_callback.losses)
loss_broken = sum(broken_loss_callback.losses)
# mean/sum loss should not vary too much.
relative_diff = abs(loss_base - loss_broken) / max(loss_base, loss_broken)
self.assertLess(relative_diff, 0.2, f"Relative difference {relative_diff} is not within 0.2")
# mean/sum loss should not vary too much.
relative_diff = abs(loss_base - loss_broken) / max(loss_base, loss_broken)
self.assertLess(relative_diff, 0.2, f"Relative difference {relative_diff} is not within 0.2")
def test_gradient_accumulation_loss_alignment_with_loss_func(self):
set_seed(42)
@@ -1214,14 +1224,14 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
self.assertFalse(torch.allclose(trainer.model.b, b))
self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0)
@require_torch_accelerator
@require_torch_bf16
@require_torch_accelerator
def test_mixed_bf16(self):
# very basic test
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(learning_rate=0.1, bf16=True, output_dir=tmp_dir)
trainer.train()
self.check_trained_model(trainer.model)
self.check_trained_model(trainer.model, atol=ATOL, rtol=RTOL)
# --bf16 --half_precision_backend apex can't be used together
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -3582,6 +3592,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
)
@slow
@run_first
def test_trainer_eval_mrpc(self):
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3598,6 +3609,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertLess(result["eval_loss"], 0.2)
@slow
@run_first
def test_trainer_eval_multiple(self):
MODEL_ID = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3897,6 +3909,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
trainer = get_regression_trainer(skip_memory_metrics=True, output_dir=tmp_dir)
self.check_mem_metrics(trainer, self.assertNotIn)
@require_torch_fp16
@require_torch_accelerator
def test_fp16_full_eval(self):
# this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
@@ -4152,6 +4165,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
@slow
@require_non_hpu
@require_torch_multi_accelerator
def test_end_to_end_example(self):
# Tests that `translation.py` will run without issues

View File

@@ -19,12 +19,11 @@ import numpy as np
from transformers import EvalPrediction, HfArgumentParser, TrainingArguments, is_torch_available
from transformers.testing_utils import (
TestCasePlus,
backend_device_count,
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_gpu,
require_torch_multi_xpu,
require_torch_neuroncore,
require_torch_npu,
require_torch_multi_accelerator,
torch_device,
)
from transformers.training_args import ParallelMode
from transformers.utils import logging
@@ -117,38 +116,10 @@ if is_torch_available():
return result
class TestTrainerDistributedNeuronCore(TestCasePlus):
@require_torch_neuroncore
def test_trainer(self):
distributed_args = f"""--nproc_per_node=2
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = ["torchrun"] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributedNPU(TestCasePlus):
@require_torch_npu
def test_trainer(self):
distributed_args = f"""--nproc_per_node=2
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = ["torchrun"] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributed(TestCasePlus):
@require_torch_multi_gpu
@require_torch_multi_accelerator
def test_trainer(self):
distributed_args = f"""--nproc_per_node={torch.cuda.device_count()}
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
@@ -159,20 +130,6 @@ class TestTrainerDistributed(TestCasePlus):
# successful return here == success - any errors would have caused an error in the sub-call
@require_torch_multi_xpu
class TestTrainerDistributedXPU(TestCasePlus):
def test_trainer(self):
distributed_args = f"""--nproc_per_node={torch.xpu.device_count()}
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = ["torchrun"] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
if __name__ == "__main__":
# The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
#

View File

@@ -17,12 +17,15 @@ from typing import Dict
from transformers import is_torch_available
from transformers.testing_utils import (
TestCasePlus,
backend_device_count,
execute_subprocess_async,
get_torch_dist_unique_port,
require_accelerate,
require_fp8,
require_fsdp,
require_torch_multi_gpu,
require_torch_multi_accelerator,
run_first,
torch_device,
)
@@ -64,9 +67,10 @@ if is_torch_available():
class TestFSDPTrainer(TestCasePlus):
@require_torch_multi_accelerator
@require_accelerate
@require_torch_multi_gpu
@require_fsdp
@run_first
def test_trainer(self):
output_dir = self.get_auto_remove_tmp_dir()
cmd = [
@@ -76,7 +80,7 @@ class TestFSDPTrainer(TestCasePlus):
"--main_process_port",
f"{get_torch_dist_unique_port()}",
"--num_processes",
f"{torch.cuda.device_count()}",
f"{backend_device_count(torch_device)}",
"--fsdp_transformer_layer_cls_to_wrap",
"GPT2Block",
f"{self.test_file_dir}/test_trainer_fsdp.py",
@@ -90,10 +94,11 @@ class TestFSDPTrainer(TestCasePlus):
class TestFSDPTrainerFP8(TestCasePlus):
@require_torch_multi_accelerator
@require_accelerate
@require_torch_multi_gpu
@require_fsdp
@require_fp8
@run_first
def test_trainer(self):
output_dir = self.get_auto_remove_tmp_dir()
cmd = [
@@ -103,7 +108,7 @@ class TestFSDPTrainerFP8(TestCasePlus):
"--main_process_port",
f"{get_torch_dist_unique_port()}",
"--num_processes",
f"{torch.cuda.device_count()}",
f"{backend_device_count(torch_device)}",
"--mixed_precision",
"fp8",
"--fsdp_transformer_layer_cls_to_wrap",
@@ -117,32 +122,34 @@ class TestFSDPTrainerFP8(TestCasePlus):
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestFSDPTrainerWrap(TestCasePlus):
@require_accelerate
@require_torch_multi_gpu
@require_fsdp
def test_trainer(self):
output_dir = self.get_auto_remove_tmp_dir()
cmd = [
"accelerate",
"launch",
"--use_fsdp",
"--main_process_port",
f"{get_torch_dist_unique_port()}",
"--num_processes",
f"{torch.cuda.device_count()}",
"--fsdp_transformer_layer_cls_to_wrap",
"GPT2Block",
f"{self.test_file_dir}/test_trainer_fsdp.py",
"--output_dir",
f"{output_dir}",
"--report_to",
"none",
"--auto_find_batch_size",
"True",
]
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestFSDPTrainerWrap(TestCasePlus):
@require_torch_multi_accelerator
@require_accelerate
@require_fsdp
@run_first
def test_trainer(self):
output_dir = self.get_auto_remove_tmp_dir()
cmd = [
"accelerate",
"launch",
"--use_fsdp",
"--main_process_port",
f"{get_torch_dist_unique_port()}",
"--num_processes",
f"{backend_device_count(torch_device)}",
"--fsdp_transformer_layer_cls_to_wrap",
"GPT2Block",
f"{self.test_file_dir}/test_trainer_fsdp.py",
"--output_dir",
f"{output_dir}",
"--report_to",
"none",
"--auto_find_batch_size",
"True",
]
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
if __name__ == "__main__":